数据加载、存储与文件格式:逐块读取文本文件
import pandas as pd
from pandas import Series

# Read the whole CSV file into a single DataFrame.
result = pd.read_csv("examples/ex6.csv")
print(result)
# Output recorded by the author:
#            one       two     three      four key
# 0     0.467976 -0.038649 -0.295344 -1.824726   L
# 1    -0.358893  1.404453  0.704965 -0.200638   B
# ...
# 9999 -0.096376 -1.012999 -0.657431 -0.573315   0
# [10000 rows x 5 columns]

# To read only the first few rows, pass nrows.
result = pd.read_csv("examples/ex6.csv", nrows=5)
print(result)
# Output recorded by the author:
#         one       two     three      four key
# 0  0.467976 -0.038649 -0.295344 -1.824726   L
# 1 -0.358893  1.404453  0.704965 -0.200638   B
# 2 -0.501840  0.659254 -0.421691 -0.057688   G
# 3  0.204886  1.074134  1.388361 -0.982404   R
# 4  0.354628 -0.133116  0.283763 -0.837063   Q

# To read the file piece by piece, pass chunksize (rows per piece).
# The call then returns an iterator instead of a DataFrame; the author's
# session displayed it as:
# <pandas.io.parsers.TextFileReader object at 0x007570F0>
chunker = pd.read_csv("examples/ex6.csv", chunksize=1000)

# Running total of how often each distinct value occurs in the "key" column.
# The dtype is given explicitly: an empty Series without one triggers a
# DeprecationWarning on older pandas and defaults to object dtype on newer
# pandas, which would make the summed result object-typed instead of numeric.
tot = Series([], dtype="int64")
for piece in chunker:
    # With chunksize=1000 every piece is a DataFrame of up to 1000 rows.
    # value_counts() returns, for each distinct value in the column, the
    # number of rows holding that value.
    # A label present on only one side of the addition is aligned to NaN, and
    # NaN plus anything is NaN; fill_value=0 substitutes 0 for the missing
    # side so the counts accumulate instead of being wiped out.
    tot = tot.add(piece["key"].value_counts(), fill_value=0)

# Sort once, after every chunk has been accumulated, largest count first.
tot = tot.sort_values(ascending=False)
print(tot[:10])
# Output recorded by the author:
# E    368.0
# X    364.0
# L    346.0
# O    343.0
# Q    340.0
# M    338.0
# J    337.0
# F    335.0
# K    334.0
# H    330.0
# dtype: float64
a.add(b,fill_value=0)
import pandas as pd
import numpy as np

# Two Series whose indexes only partly overlap and which both contain NaN,
# used to show how fill_value changes the result of Series.add.
a = pd.Series([1, 2, 3, np.nan], index=['a', 'b', 'c', 'd'])
b = pd.Series([1.1, np.nan, 3.1, np.nan], index=['a', 'b', 'd', 'e'])

# With fill_value=0, a value that is missing on exactly one side is treated
# as 0 before adding. Label 'e' is missing on both sides (absent from `a`,
# NaN in `b`), so it stays NaN.
print(a.add(b, fill_value=0))
# Output recorded by the author:
# a    2.1
# b    2.0
# c    3.0
# d    3.1
# e    NaN
# dtype: float64

# Without fill_value, any label that is missing or NaN on either side
# produces NaN; only 'a' has a real value in both Series.
print(a.add(b))
# Output recorded by the author:
# a    2.1
# b    NaN
# c    NaN
# d    NaN
# e    NaN
# dtype: float64
本文来自博客园,作者:OTAKU_nicole,转载请注明原文链接:https://www.cnblogs.com/nicole-zhang/p/14419795.html