Pandas——比较两个 DataFrame 之间的区别
# Use the datacompy library to compare two DataFrames; the same approach
# also works for comparing the contents of two txt files.
import pandas as pd
import datacompy

# header=None makes pandas treat every line as data and label the columns
# 0, 1, ... instead of consuming the first line as a header.
read_options = {"header": None}
df1 = pd.read_csv("1.txt", **read_options)
df2 = pd.read_csv("22.txt", **read_options)

# Rows are matched on column 0; report() returns the human-readable summary.
comparison = datacompy.Compare(df1, df2, join_columns=0)
summary = comparison.report()
print(summary)
结果
DataComPy Comparison
--------------------
DataFrame Summary
-----------------
DataFrame Columns Rows
0 df1 1 2
1 df2 1 6
Column Summary
--------------
Number of columns in common: 1
Number of columns in df1 but not in df2: 0
Number of columns in df2 but not in df1: 0
Row Summary
-----------
Matched on: 0
Any duplicates on match values: No
Absolute Tolerance: 0
Relative Tolerance: 0
Number of rows in common: 2
Number of rows in df1 but not in df2: 0
Number of rows in df2 but not in df1: 4
Number of rows with some compared columns unequal: 0
Number of rows with all compared columns equal: 2
Column Comparison
-----------------
Number of columns compared with some values unequal: 0
Number of columns compared with all values equal: 1
Total number of values which compare unequal: 0
Sample Rows Only in df2 (First 10 Columns)
------------------------------------------
0
3 vasdj
4 顺嘿嘿
5 顺顺
2 afdlkaewlhg
# Compare every pair of same-named .txt files found in two folders and save
# each datacompy report as "<file name>_result.txt" in the current working
# directory.
import glob
import os

import datacompy
import pandas as pd

all_files1 = glob.glob(r"C:\Users\15773\Desktop\test\1\*.txt")
all_files2 = glob.glob(r"C:\Users\15773\Desktop\test\2\*.txt")

# Index the second folder by base name once, so each file from the first
# folder finds its counterpart with a single lookup instead of rescanning
# the whole second list.
files2_by_name = {os.path.basename(path): path for path in all_files2}

for file1 in all_files1:
    file1_basename = os.path.basename(file1)
    file2 = files2_by_name.get(file1_basename)
    if file2 is None:
        # No file with the same name in the second folder: nothing to compare.
        continue

    # header=None: every line is data; the columns are labeled 0, 1, ...
    df1 = pd.read_csv(file1, header=None)
    df2 = pd.read_csv(file2, header=None)

    # Match rows on column 0 and build the human-readable report.
    dd = datacompy.Compare(df1, df2, join_columns=0)
    report = dd.report()
    print(df1)
    print(df2)
    print(report)

    txt_name = f"{file1_basename}_result.txt"
    # The context manager closes the file even if the write fails. The
    # explicit UTF-8 encoding keeps non-ASCII report content writable
    # regardless of the platform's default encoding.
    with open(txt_name, "w", encoding="utf-8") as result_txt:
        result_txt.write(report)

print("process done")
# Self-contained datacompy example: two small in-memory CSV tables are
# compared on their shared "acct_id" column and the report is printed.
from io import StringIO

import pandas as pd
import datacompy

# "Original" table: five accounts, including a date_fld column.
data1 = """acct_id,dollar_amt,name,float_fld,date_fld
10000001234,123.45,George Maharis,14530.1555,2017-01-01
10000001235,0.45,Michael Bluth,1,2017-01-01
10000001236,1345,George Bluth,,2017-01-01
10000001237,123456,Bob Loblaw,345.12,2017-01-01
10000001239,1.05,Lucille Bluth,,2017-01-01
"""

# "New" table: no date_fld column, and some values differ from the original.
data2 = """acct_id,dollar_amt,name,float_fld
10000001234,123.4,George Michael Bluth,14530.155
10000001235,0.45,Michael Bluth,
10000001236,1345,George Bluth,1
10000001237,123456,Robert Loblaw,345.12
10000001238,1.05,Loose Seal Bluth,111
"""

original_df = pd.read_csv(StringIO(data1))
new_df = pd.read_csv(StringIO(data2))

comparison_options = {
    "join_columns": "acct_id",  # a list of column names is accepted as well
    "abs_tol": 0,               # optional, defaults to 0
    "rel_tol": 0,               # optional, defaults to 0
    "df1_name": "Original",     # optional, defaults to 'df1'
    "df2_name": "New",          # optional, defaults to 'df2'
}
comparison = datacompy.Compare(original_df, new_df, **comparison_options)

# Evaluates to False for this sample data; the result is not used further.
comparison.matches(ignore_extra_columns=False)

# report() builds a human-readable text that summarizes and samples the
# differences between the two tables.
report_text = comparison.report()
print(report_text)
DataComPy Comparison
--------------------
DataFrame Summary
-----------------
DataFrame Columns Rows
0 Original 5 5
1 New 4 5
Column Summary
--------------
Number of columns in common: 4
Number of columns in Original but not in New: 1
Number of columns in New but not in Original: 0
Row Summary
-----------
Matched on: acct_id
Any duplicates on match values: No
Absolute Tolerance: 0
Relative Tolerance: 0
Number of rows in common: 4
Number of rows in Original but not in New: 1
Number of rows in New but not in Original: 1
Number of rows with some compared columns unequal: 4
Number of rows with all compared columns equal: 0
Column Comparison
-----------------
Number of columns compared with some values unequal: 3
Number of columns compared with all values equal: 1
Total number of values which compare unequal: 6
Columns with Unequal Values or Types
------------------------------------
Column Original dtype New dtype # Unequal Max Diff # Null Diff
2 dollar_amt float64 float64 1 0.0500 0
0 float_fld float64 float64 3 0.0005 2
1 name object object 2 0.0000 0
Sample Rows with Unequal Values
-------------------------------
acct_id float_fld (Original) float_fld (New)
2 10000001236 NaN 1.000
0 10000001234 14530.1555 14530.155
1 10000001235 1.0000 NaN
acct_id name (Original) name (New)
3 10000001237 Bob Loblaw Robert Loblaw
0 10000001234 George Maharis George Michael Bluth
acct_id dollar_amt (Original) dollar_amt (New)
0 10000001234 123.45 123.4
Sample Rows Only in Original (First 10 Columns)
-----------------------------------------------
acct_id dollar_amt name float_fld date_fld
4 10000001239 1.05 Lucille Bluth NaN 2017-01-01
Sample Rows Only in New (First 10 Columns)
------------------------------------------
acct_id dollar_amt name float_fld
5 10000001238 1.05 Loose Seal Bluth 111.0