Data
Stay hungry, Stay foolish!

导航

 

数据的保存

import pandas as pd
import numpy as np
from pandas import Series

# Build a small demo frame: three row-lists (two of them carrying NaN
# in the last slot), then label the six columns explicitly.
raw_rows = [
    ['one', 1, 2, 3, 4, np.nan],
    ['two', 5, 6, 8, 'world', np.nan],
    ['three', 9, 10, 11, 12, 'foo'],
]
data = pd.DataFrame(raw_rows, columns=['somthing', 'a', 'b', 'c', 'd', 'message'])
data

    somthing	a	b	c	d	message
0	one	1	2	3	4	NaN
1	two	5	6	8	world	NaN
2	three	9	10	11	12	foo

# DataFrame.to_csv writes comma-separated values by default.
data.to_csv('save.csv')
# A custom delimiter can be chosen with sep.
data.to_csv('save_.csv', sep='|')
# Missing values are written as empty strings unless na_rep is given.
data.to_csv('save_1.csv', na_rep='NULL')
# header=False / index=False drop the row and column labels, keeping data only.
data.to_csv('save_2.csv', header=False, index=False)
# Write only the listed columns, in the given order (overwrites save_2.csv).
data.to_csv('save_2.csv', index=False, columns=['a', 'b', 'c'])
# Series.from_csv was deprecated in pandas 0.21 and removed in 1.0.
# Reproduce its behaviour (header=None, first column as index) with
# pd.read_csv, taking the first value column as a Series.
pd.read_csv('save_2.csv', header=None, index_col=0).iloc[:, 0]

a     b
1     2
5     6
9    10
dtype: object

手工处理分隔符格式

大部分的表格型数据都能用pd.read_table进行加载,但是由于含有畸形行的文件而使read_table出毛病的情况并不少见
例如如下的格式文件:

a, b, c d
1, 2, 3
1, 2, 3, 4
import csv
# pd.read_csv chokes on malformed rows; fall back to the stdlib csv module.
# The with-block guarantees the file handle is closed (the original
# open() leaked it).
with open('save_2.csv') as f:
    # csv.reader accepts any open file-like object and yields lists.
    reader = csv.reader(f)
    for line in reader:
        print(line, type(line))

['a', 'b', 'c'] <class 'list'>
['1', '2', '3'] <class 'list'>
['5', '6', '8', '10'] <class 'list'>

# Materialise every parsed row up front for easier slicing; the
# with-block closes the handle (the original open() leaked it).
with open('save_2.csv') as fh:
    lines = list(csv.reader(fh))
lines

[['a', 'b', 'c'], ['1', '2', '3'], ['5', '6', '8', '10']]

header, values = lines[0], lines[1:]
# zip(*values) transposes the data rows into per-column tuples;
# pairing those with the header names yields (name, column) tuples.
for pair in zip(header, zip(*values)):
    print(pair)

('a', ('1', '5'))
('b', ('2', '6'))
('c', ('3', '8'))

# The same header-to-column pairing, folded into a dict comprehension.
{name: column for name, column in zip(header, zip(*values))}

{'a': ('1', '5'), 'b': ('2', '6'), 'c': ('3', '8')}

# Emit a delimited file by hand with csv.writer.  'w' mode truncates
# and recreates the file.  newline='' is required by the csv docs:
# without it the writer doubles line endings on Windows.
with open('save_2.csv', 'w', newline='') as f:
    writer = csv.writer(f)
    writer.writerow(('new_1', 'new_2'))

JSON数据

如何将JSON对象转为DataFrame或其他便于分析的数据结构


import json

# A raw JSON document: null decodes to None, arrays to lists,
# nested objects to dicts.
obj = '''
{
"name":"wes",
"places_lived":["United Statues","Spain","Germany"],
"pet": null,
"siblings":[{"name":"Scott","age":25,"pet":"Zuko"},
    {"name":"Katie","age":33,"pet":"Cisco"}]
}
'''
# Decode, then lift the record-shaped part (a list of dicts) straight
# into a DataFrame.
result = json.loads(obj)
pd.DataFrame(result['siblings'])

    age	name	pet
0	25	Scott	Zuko
1	33	Katie	Cisco

使用数据库

# The stdlib sqlite3 driver, pointed at an in-memory database.
import sqlite3

# DDL for a small four-column table.
query = '''
CREATE TABLE test
(
a VARCHAR(20),
b VARCHAR(20),
c REAL,
d INT
);
'''

# ':memory:' creates the database in RAM rather than on disk.
con = sqlite3.connect(':memory:')
con.execute(query)
con.commit()

# Insert a few sample rows matching the (a, b, c, d) schema.
# NOTE: the original bound this list to `data`, clobbering the
# DataFrame created earlier — use a distinct name instead.
rows_to_insert = [
    ('Atlanta', 'Georgia', 1.25, 6),
    ('Tallahassee', 'Florida', 2.6, 3),
    ('Sacramento', 'California', 1.7, 5),
]
# Parameterized statement; executemany binds each tuple in turn.
stmt = 'INSERT INTO test VALUES(?,?,?,?)'
con.executemany(stmt, rows_to_insert)
con.commit()

# Read everything back; fetchall() returns a list of row tuples.
select_all = 'select * from test'
cursor = con.execute(select_all)
rows = cursor.fetchall()
rows

[('Atlanta', 'Georgia', 1.25, 6),
 ('Tallahassee', 'Florida', 2.6, 3),
 ('Sacramento', 'California', 1.7, 5)]

 # Grab the column names: cursor.description is a sequence of 7-tuples
 # whose first element is the name (sqlite3 leaves the other six as None).
cursor.description
(('a', None, None, None, None, None, None),
 ('b', None, None, None, None, None, None),
 ('c', None, None, None, None, None, None),
 ('d', None, None, None, None, None, None))

 # zip(*)返回矩阵,与zip作用相反
k = zip(*cursor.description)
# for i in k:
#     print(i)
# 直接使用k[0]会报错,zip对象不支持'zip' object is not subscriptable,需要借助list包装
list(k)[0]

('a', 'b', 'c', 'd')

# Rebuild the frame from the fetched rows, naming columns from the
# first element of each descriptor tuple.
pd.DataFrame(rows, columns=[col[0] for col in cursor.description])

    a	b	c	d
0	Atlanta	Georgia	1.25	6
1	Tallahassee	Florida	2.60	3
2	Sacramento	California	1.70	5
pandas有一个可以简化上面过程的read_sql函数,只需要传入select语句和连接对象即可
# pandas collapses the query-fetch-build dance into one call: pass a
# SELECT statement and a connection object.  pandas.io.sql is the
# internal module; the public, stable entry point is pd.read_sql.
import pandas.io.sql as sql  # kept for legacy code below that uses `sql`
pd.read_sql('select * from test', con)

    a	b	c	d
0	Atlanta	Georgia	1.25	6
1	Tallahassee	Florida	2.60	3
2	Sacramento	California	1.70	5

# The same read_sql call works against MySQL through any DB-API
# connection — here via pymysql.
import pymysql

mysql_con = pymysql.connect(host='127.0.0.1', port=3306, user='root',
                            passwd='123456', db='taobao', charset='utf8')
sql.read_sql('select * from tblive2', mysql_con)
存取MongoDB中的数据
import pymongo

# Open a client against a local MongoDB instance.
client = pymongo.MongoClient('localhost', port=27017)

# Select the target database...
db = client.wechat_spider

# ...and query its posts collection (specific to the author's machine).
# DataFrame cannot consume the cursor directly, so materialise it
# into a list first.
pd.DataFrame(list(db.posts.find()))
posted on 2018-11-21 16:40  进击中的青年  阅读(2200)  评论(0编辑  收藏  举报