csv文件读取

In [1]:
import pandas as pd
import sys
In [2]:
%cat examples/ex2.csv
 
1,2,3,4,hello
5,6,7,8,world
9,10,11,12,foo
In [3]:
#文件没有标签数据
pd.read_csv('examples/ex2.csv',header=None)
Out[3]:
 
 01234
0 1 2 3 4 hello
1 5 6 7 8 world
2 9 10 11 12 foo
In [4]:
pd.read_csv('examples/ex2.csv',names=['a','b','c','d','massage'])
Out[4]:
 
 abcdmassage
0 1 2 3 4 hello
1 5 6 7 8 world
2 9 10 11 12 foo
In [5]:
#将其中的一列设为索引列
pd.read_csv('examples/ex2.csv',names=['a','b','c','d','massage'],index_col='massage')
Out[5]:
 
 abcd
massage    
hello 1 2 3 4
world 5 6 7 8
foo 9 10 11 12
In [6]:
list(open('examples/ex3.txt'))
Out[6]:
['            A         B         C\n',
 'aaa -0.264438 -1.026059 -0.619500\n',
 'bbb  0.927272  0.302904 -0.032399\n',
 'ccc -0.264273 -0.386314 -0.217601\n',
 'ddd -0.871858 -0.348382  1.100491\n']
In [7]:
#多出的一列数据自动识别为索引,分隔符不同使用正则表达式
pd.read_csv('examples/ex3.csv',sep='\s+')
Out[7]:
 
 ABC
aaa -0.264438 -1.026059 -0.619500
bbb 0.927272 0.302904 -0.032399
ccc -0.264273 -0.386314 -0.217601
ddd -0.871858 -0.348382 1.100491
 

将读取的非空数据设为NaN

In [8]:
%cat examples/ex5.csv
 
something,a,b,c,d,message
one,1,2,3,4,NA
two,5,6,,8,world
three,9,10,11,12,foo
In [9]:
pd.read_csv('examples/ex5.csv',na_values={'something':'two','massage':['NA','foo']})
Out[9]:
 
 somethingabcdmessage
0 one 1 2 3.0 4 NaN
1 NaN 5 6 NaN 8 world
2 three 9 10 11.0 12 foo
In [10]:
#只读取一部分数据
pd.read_csv('examples/ex6.csv',nrows=10)
Out[10]:
 
 onetwothreefourkey
0 0.467976 -0.038649 -0.295344 -1.824726 L
1 -0.358893 1.404453 0.704965 -0.200638 B
2 -0.501840 0.659254 -0.421691 -0.057688 G
3 0.204886 1.074134 1.388361 -0.982404 R
4 0.354628 -0.133116 0.283763 -0.837063 Q
5 1.817480 0.742273 0.419395 -2.251035 Q
6 -0.776764 0.935518 -0.332872 -1.875641 U
7 -0.913135 1.530624 -0.572657 0.477252 K
8 0.358480 -0.497572 -0.367016 0.507702 S
9 -1.740877 -1.160417 -1.637830 2.172201 G
In [11]:
#将数据分块读取
chunker = pd.read_csv('examples/ex6.csv',chunksize=1000)
for piece in chunker:
    print(piece.iloc[0])
 
one       0.467976
two     -0.0386485
three    -0.295344
four      -1.82473
key              L
Name: 0, dtype: object
one       0.467976
two     -0.0386485
three    -0.295344
four      -1.82473
key              T
Name: 1000, dtype: object
one       0.467976
two     -0.0386485
three    -0.295344
four      -1.82473
key              1
Name: 2000, dtype: object
one       0.467976
two     -0.0386485
three    -0.295344
four      -1.82473
key              H
Name: 3000, dtype: object
one       0.467976
two     -0.0386485
three    -0.295344
four      -1.82473
key              H
Name: 4000, dtype: object
one       0.467976
two     -0.0386485
three    -0.295344
four      -1.82473
key              1
Name: 5000, dtype: object
one       0.467976
two     -0.0386485
three    -0.295344
four      -1.82473
key              I
Name: 6000, dtype: object
one       0.467976
two     -0.0386485
three    -0.295344
four      -1.82473
key              1
Name: 7000, dtype: object
one       0.467976
two     -0.0386485
three    -0.295344
four      -1.82473
key              7
Name: 8000, dtype: object
one       0.467976
two     -0.0386485
three    -0.295344
four      -1.82473
key              B
Name: 9000, dtype: object
 

写入csv文件

In [12]:
data = pd.read_csv('examples/ex5.csv')
data.to_csv(sys.stdout)
 
,something,a,b,c,d,message
0,one,1,2,3.0,4,
1,two,5,6,,8,world
2,three,9,10,11.0,12,foo
In [13]:
data.to_csv(sys.stdout,sep='|')
 
|something|a|b|c|d|message
0|one|1|2|3.0|4|
1|two|5|6||8|world
2|three|9|10|11.0|12|foo
In [14]:
#对缺失值进行标识
data.to_csv(sys.stdout,na_rep='NULL')
 
,something,a,b,c,d,message
0,one,1,2,3.0,4,NULL
1,two,5,6,NULL,8,world
2,three,9,10,11.0,12,foo
In [15]:
data.to_csv(sys.stdout,index=False,header=False)
 
one,1,2,3.0,4,
two,5,6,,8,world
three,9,10,11.0,12,foo
In [16]:
#只写入子集
data.to_csv(sys.stdout,index=False,columns=['a','b','c'])
 
a,b,c
1,2,3.0
5,6,
9,10,11.0
 

json文件

In [17]:
%cat examples/example.json
 
[{"a": 1, "b": 2, "c": 3},
 {"a": 4, "b": 5, "c": 6},
 {"a": 7, "b": 8, "c": 9}]
In [18]:
data = pd.read_json('examples/example.json')
data
Out[18]:
 
 abc
0 1 2 3
1 4 5 6
2 7 8 9
In [19]:
data.to_json(sys.stdout)
 
{"a":{"0":1,"1":4,"2":7},"b":{"0":2,"1":5,"2":8},"c":{"0":3,"1":6,"2":9}}
In [20]:
#按行写入
data.to_json(sys.stdout,orient='records')
 
[{"a":1,"b":2,"c":3},{"a":4,"b":5,"c":6},{"a":7,"b":8,"c":9}]
 

HTML

In [21]:
#搜索并解析包含在table标签中的数据
tables = pd.read_html('examples/fdic_failed_bank_list.html')
#只有一张表格
len(tables)
Out[21]:
1
In [22]:
data = tables[0]
data.head()
Out[22]:
 
 Bank NameCitySTCERTAcquiring InstitutionClosing DateUpdated Date
0 Allied Bank Mulberry AR 91 Today's Bank September 23, 2016 November 17, 2016
1 The Woodbury Banking Company Woodbury GA 11297 United Bank August 19, 2016 November 17, 2016
2 First CornerStone Bank King of Prussia PA 35312 First-Citizens Bank & Trust Company May 6, 2016 September 6, 2016
3 Trust Company Bank Memphis TN 9956 The Bank of Fayette County April 29, 2016 September 6, 2016
4 North Milwaukee State Bank Milwaukee WI 20364 First-Citizens Bank & Trust Company March 11, 2016 June 16, 2016
 

excel

In [23]:
data = pd.read_excel('examples/ex1.xlsx','Sheet1')
data
Out[23]:
 
 Unnamed: 0abcdmessage
0 0 1 2 3 4 hello
1 1 5 6 7 8 world
2 2 9 10 11 12 foo
In [24]:
data.to_excel('examples/ex2.xlsx')
 

Web API

In [25]:
import requests
url = 'https://api.github.com/repos/pandas-dev/pandas/issues'
resp = requests.get(url)
resp
Out[25]:
<Response [200]>
In [26]:
data = resp.json()#data为字典数组
issues = pd.DataFrame(data,columns=['title','url','state','labels'])#提取部分字段
issues.head()
Out[26]:
 
 titleurlstatelabels
0 Resample on MultiIndex level is much longer th... https://api.github.com/repos/pandas-dev/pandas... open []
1 BUG: value_counts can handle the case even wit... https://api.github.com/repos/pandas-dev/pandas... open []
2 Build error on Arch linux https://api.github.com/repos/pandas-dev/pandas... open []
3 BUG: DataFrame.to_html validates formatters ha... https://api.github.com/repos/pandas-dev/pandas... open []
4 BUG DataFream.rindex方法参数的错误 https://api.github.com/repos/pandas-dev/pandas... open [{'id': 76811, 'node_id': 'MDU6TGFiZWw3NjgxMQ=...