数据加载、存储与文件格式:读写文本格式的数据

import pandas as pd

# Read a comma-separated file with read_csv.
# Forward slashes keep the path portable across operating systems and avoid
# the invalid "\e" escape sequence that a backslash-separated literal contains.
df1 = pd.read_csv("examples/ex1.csv")
print(df1)
'''
   a   b   c   d message
0  1   2   3   4   hello
1  5   6   7   8   world
2  9  10  11  12     foo
'''
# read_table splits on "\t" by default, so a comma-separated file is read
# as a single column.
df2 = pd.read_table("examples/ex1.csv")
print(df2)
'''
  a,b,c,d,message
0   1,2,3,4,hello
1   5,6,7,8,world
2  9,10,11,12,foo
'''
# Passing the separator explicitly makes read_table parse the columns.
df3 = pd.read_table("examples/ex1.csv", sep=',')
print(df3)
'''
   a   b   c   d message
0  1   2   3   4   hello
1  5   6   7   8   world
2  9  10  11  12     foo
'''
# When the file has no header row, header=None makes pandas assign default
# integer column names.
# Forward slashes keep the path portable across operating systems and avoid
# the invalid "\e" escape sequence that a backslash-separated literal contains.
df4 = pd.read_csv("examples/ex2.csv", header=None)
print(df4)
'''
   0   1   2   3      4
0  1   2   3   4  hello
1  5   6   7   8  world
2  9  10  11  12    foo
'''
# Supply custom column names; the same list is reused for the index_col
# example below.
names = ['a', 'b', 'c', 'd', 'message']
df5 = pd.read_csv("examples/ex2.csv", names=names)
print(df5)
'''
   a   b   c   d message
0  1   2   3   4   hello
1  5   6   7   8   world
2  9  10  11  12     foo
'''
# Use one of the columns as the row index via the index_col parameter.
df6 = pd.read_csv("examples/ex2.csv", names=names, index_col='message')
print(df6)
'''
         a   b   c   d
message               
hello    1   2   3   4
world    5   6   7   8
foo      9  10  11  12
'''
# Hierarchical index: index_col takes a list of column names (or column
# numbers) that together form the row index.
parsed = pd.read_csv(
    "examples/csv_mindex.csv",
    index_col=['key1', 'key2'],
)
print(parsed)
'''
           value1  value2
key1 key2                
one  a          1       2
     b          3       4
     c          5       6
     d          7       8
two  a          9      10
     b         11      12
     c         13      14
     d         15      16
'''

# Fields separated by a variable amount of whitespace can be split with the
# regular expression \s+.
# Show the raw lines first; the context manager closes the file handle
# instead of leaving it open.
with open("examples/ex3.txt") as f:
    print(list(f))
'''
['            A         B         C\n', 
'aaa -0.264438 -1.026059 -0.619500\n', 
'bbb  0.927272  0.302904 -0.032399\n', 
'ccc -0.264273 -0.386314 -0.217601\n', 
'ddd -0.871858 -0.348382  1.100491\n']
'''
# A raw string keeps the backslash in the regex literal without relying on
# an invalid string escape sequence.
result = pd.read_table("examples/ex3.txt", sep=r'\s+')
print(result)
'''
            A         B         C
aaa -0.264438 -1.026059 -0.619500
bbb  0.927272  0.302904 -0.032399
ccc -0.264273 -0.386314 -0.217601
ddd -0.871858 -0.348382  1.100491
'''
# skiprows lists the lines of the file to skip while reading.
df7 = pd.read_csv(
    "examples/ex4.csv",
    skiprows=[0, 2, 3],
)
print(df7)
'''
   a   b   c   d message
0  1   2   3   4   hello
1  5   6   7   8   world
2  9  10  11  12     foo
'''
# Missing-value handling: missing entries in the file show up as NaN.
result = pd.read_csv("examples/ex5.csv")
print(result)
'''
  something  a   b     c   d message
0       one  1   2   3.0   4     NaN
1       two  5   6   NaN   8   world
2     three  9  10  11.0  12     foo
'''
# Boolean mask marking which cells are missing.
print(result.isnull())
'''
   something      a      b      c      d  message
0      False  False  False  False  False     True
1      False  False  False   True  False    False
2      False  False  False  False  False    False
'''
# na_values accepts a collection of strings to be treated as missing values.
result = pd.read_csv(
    "examples/ex5.csv",
    na_values=["NULL"],
)
print(result)
'''
  something  a   b     c   d message
0       one  1   2   3.0   4     NaN
1       two  5   6   NaN   8   world
2     three  9  10  11.0  12     foo
'''
# A dict maps each column name to its own list of NA marker strings.
sentinels = {
    'message': ['foo', 'NA'],
    'something': ['two', 'three'],
}
result = pd.read_csv("examples/ex5.csv", na_values=sentinels)
print(result)
'''
  something  a   b     c   d message
0       one  1   2   3.0   4     NaN
1       NaN  5   6   NaN   8   world
2       NaN  9  10  11.0  12     NaN
'''

 

  • sep或delimiter:用于对各行字段拆分的字符串或正则表达式
  • header:列名行号,默认0为第一行;如果文件没有header行,应设置为None
  • index_col:用作行索引的列编号或者列名
  • names:列名列表,结合header=None
  • skiprows:需要跳过的行
  • na_values:一组用于替换NA的值
  • comment:用于将注释信息从行尾拆分出去的字符
  • nrows:需要读取的行数,从文件开始算起
  • skipfooter(旧版pandas中为skip_footer):需要忽略的行数,从文件末尾算起

 

posted @ 2021-02-20 11:07  OTAKU_nicole  阅读(51)  评论(0)  编辑  收藏  举报