数据加载、存储与文件格式:逐块读取文本文件

import pandas as pd
from pandas import Series

# Load the entire CSV file into a single DataFrame.
result = pd.read_csv("examples/ex6.csv")
print(result)
'''
           one       two     three      four key
0     0.467976 -0.038649 -0.295344 -1.824726   L
1    -0.358893  1.404453  0.704965 -0.200638   B
……
9999 -0.096376 -1.012999 -0.657431 -0.573315   0
[10000 rows x 5 columns]
'''
# To read only the first few rows, pass nrows.
result = pd.read_csv("examples/ex6.csv", nrows=5)
print(result)
'''
        one       two     three      four key
0  0.467976 -0.038649 -0.295344 -1.824726   L
1 -0.358893  1.404453  0.704965 -0.200638   B
2 -0.501840  0.659254 -0.421691 -0.057688   G
3  0.204886  1.074134  1.388361 -0.982404   R
4  0.354628 -0.133116  0.283763 -0.837063   Q
'''

# To read the file piece by piece, pass chunksize (the number of rows per
# piece). The call then yields a reader object that is iterated below rather
# than a single DataFrame.
chunker = pd.read_csv("examples/ex6.csv", chunksize=1000)
'''
<pandas.io.parsers.TextFileReader object at 0x007570F0>
'''
# Running total of how often each value of the 'key' column occurs.
# The dtype is given explicitly: an empty Series built without one triggers a
# warning in newer pandas and no longer defaults to float64, which is the
# dtype of the totals shown below.
tot = Series([], dtype="float64")
for piece in chunker:
    # With chunksize=1000 every piece is a DataFrame of up to 1000 rows.
    # value_counts() returns, for each distinct value in the column, the
    # number of times it appears in this piece.
    # add() aligns both Series on their index labels. A label present on only
    # one side would give NaN (NaN plus anything is NaN), so fill_value=0
    # substitutes 0 for the missing side before adding.
    tot = tot.add(piece['key'].value_counts(), fill_value=0)
# Most frequent keys first.
tot = tot.sort_values(ascending=False)
print(tot[:10])
'''
E    368.0
X    364.0
L    346.0
O    343.0
Q    340.0
M    338.0
J    337.0
F    335.0
K    334.0
H    330.0
dtype: float64
'''

 

补充示例:a.add(b,fill_value=0) 与 a.add(b) 的区别
import pandas as pd
import numpy as np
# Two Series whose indexes only partly overlap ('c' exists only in a, 'e' only
# in b); each one also contains NaN entries.
a = pd.Series(data=[1, 2, 3, np.nan], index=list("abcd"))
b = pd.Series(data=[1.1, np.nan, 3.1, np.nan], index=list("abde"))

# With fill_value=0, a value that is missing on exactly one side is treated as
# 0 before adding; a label with no value on either side ('e') stays NaN.
filled_sum = a.add(b, fill_value=0)
print(filled_sum)
'''
a    2.1
b    2.0
c    3.0
d    3.1
e    NaN
dtype: float64
'''
# Without fill_value, any label that is missing or NaN on either side gives NaN.
plain_sum = a.add(b)
print(plain_sum)
'''
a    2.1
b    NaN
c    NaN
d    NaN
e    NaN
dtype: float64
'''

 

posted @ 2021-02-20 11:40  OTAKU_nicole  阅读(158)  评论(0)  编辑  收藏  举报