import json
import pandas as pd
from crawler.db.redis_client import redis_cli
# Output parquet file that will receive the drained records.
file2 = "2.parquet"
# Redis set holding JSON-encoded records to export (presumably written by the
# crawler — TODO confirm producer).
set_key = "agi_laion_parquet_update"
# df = pq.read_pandas(file2).to_pandas()
# print(df)
# Accumulator for the parsed elements read from the Redis set.
elements = []
# Page size hint passed to SSCAN (COUNT) — Redis may return more or fewer.
batch_size = 1000
# SSCAN cursor; 0 starts a fresh iteration.
cursor = 0
def gen_part_data(data_list):
    """Yield the JSON-decoded form of each raw member in *data_list*.

    Side effect: every member that parses successfully is removed from the
    Redis set ``set_key`` (one SREM round trip per member) before being
    yielded.  A member that fails to parse raises and is left in the set.
    Note this is a generator — nothing happens until it is consumed.
    """
    for raw_member in data_list:
        decoded = json.loads(raw_member)
        redis_cli.srem(set_key, raw_member)
        yield decoded
# Drain the whole Redis set page by page.  Each SSCAN call advances the
# cursor; gen_part_data parses the returned members and deletes them from
# the set as a side effect.
# NOTE(review): removing members while an SSCAN iteration is in progress is
# intentional here (the set is being drained), but per the Redis SSCAN
# guarantees it may cause members to be returned more than once — confirm
# duplicates are acceptable downstream.
while True:
    cursor, members = redis_cli.sscan(set_key, cursor=cursor, count=batch_size)
    for record in gen_part_data(members):
        elements.append(record)
    # A returned cursor of 0 marks the end of a full iteration.
    if cursor == 0:
        break

# Persist everything collected as a single parquet file.
df = pd.DataFrame.from_dict(elements)
df.to_parquet(file2)