020_定位_消除重复数据

 

 

import pandas as pd

if __name__ == "__main__":
    # Read the workbook of student records that contains repeated rows.
    student = pd.read_excel(
        "C:/Users/18124/Desktop/pandas/020_定位_消除重复数据/副本Students_Duplicates.xlsx",
        engine="openpyxl",
    )
    print(student)

    # Option 1 - drop duplicates, matching on a single column:
    #     student.drop_duplicates(subset="Name", inplace=True)

    # Option 2 - drop duplicates, matching on several columns together:
    #     student.drop_duplicates(subset=["Name", "Test_1", "Test_2"], inplace=True)

    # Option 3 (active) - match on a single column and discard the earlier
    # occurrences: keep="last" retains the final row of each duplicate group.
    student = student.drop_duplicates(subset="Name", keep="last")
    print(student)

 

 

import pandas as pd

if __name__ == '__main__':
    # Read the workbook of student records that contains repeated rows.
    student = pd.read_excel(
        "C:/Users/18124/Desktop/pandas/020_定位_消除重复数据/副本Students_Duplicates.xlsx",
        engine="openpyxl",
    )
    print(student)

    # 1. Check for duplicates: build a boolean Series that is True for every
    #    row whose "Name" has already appeared in an earlier row.
    dupe = student.duplicated(subset="Name")
    print(dupe.any())  # True when at least one row is flagged as a duplicate

    # 2. Locate duplicates: print the flagged rows.
    #    Select with the boolean mask itself rather than feeding `dupe.index`
    #    to `.iloc`: the index holds row *labels*, while `.iloc` expects
    #    integer *positions*, and the two only coincide for a default
    #    RangeIndex. Masking is correct for any index.
    print(student[dupe])

 

posted @ 2021-02-17 20:19  火焰马  阅读(90)  评论(0编辑  收藏  举报