大数据——重复清洗

import pandas as pd
from sqlalchemy import create_engine

def clean_duplicates(df, key_columns=('地域', '成果名称')):
    """Collapse rows that share the same key columns into one row per key.

    For every group of duplicate rows the *first* row of the group is kept,
    and any of its missing (NaN) fields are filled in with the first
    non-null value found in the remaining rows of that group.  Rows that
    are not duplicated pass through unchanged, and the original row order
    is preserved.

    This fixes two bugs in the original script:
      * the fill loop compared each cell against itself
        (``result_df.loc[idx, column]`` vs ``row[column]`` of the same row),
        so it never copied anything before the duplicates were dropped;
      * "keep the first record" was only applied to the first duplicate
        row of the whole frame, not to the first row of *each* group.

    Parameters
    ----------
    df : pandas.DataFrame
        Table to deduplicate.
    key_columns : sequence of str, optional
        Columns that identify a duplicate (defaults to the original
        ``('地域', '成果名称')`` keys).

    Returns
    -------
    pandas.DataFrame
        A new DataFrame with one merged row per key; ``df`` is not mutated.
    """
    keys = list(key_columns)
    dup_mask = df.duplicated(subset=keys, keep=False)
    if not dup_mask.any():
        # Nothing to merge; return a copy so callers never share state.
        return df.copy()

    merged_rows = []
    # dropna=False keeps groups whose key contains NaN, matching the
    # original drop_duplicates behavior (NaN keys still deduplicate).
    for _, group in df[dup_mask].groupby(keys, sort=False, dropna=False):
        kept = group.iloc[0].copy()
        for col in df.columns:
            if pd.isna(kept[col]):
                non_null = group[col].dropna()
                if not non_null.empty:
                    kept[col] = non_null.iloc[0]
        merged_rows.append(kept)

    cleaned = pd.concat(
        [df[~dup_mask], pd.DataFrame(merged_rows, columns=df.columns)]
    ).sort_index()  # restore original row order
    return cleaned


def main():
    """Read the result table, deduplicate it, and write the cleaned table back."""
    engine = create_engine('mysql+pymysql://root:wwsa20030207@localhost/python_bigdata')
    try:
        # Read the result table from the database into a DataFrame.
        result_df = pd.read_sql('SELECT * FROM result_with_dimension', con=engine)

        # Merge duplicate records, keeping one enriched row per key.
        cleaned_df = clean_duplicates(result_df)

        # Write the cleaned data back to MySQL.
        cleaned_df.to_sql('result_cleaned', con=engine, if_exists='replace', index=False)
    finally:
        # Dispose the engine even if the read or write fails.
        engine.dispose()


if __name__ == '__main__':
    main()
posted @ 2024-03-25 22:56  会秃头的小白  阅读(3)  评论(0编辑  收藏  举报