Pandas读取文件
# 1. 使用to_excel创建Excel文件
import pandas as pd
df = pd.DataFrame({'id':[1,2,3],'name':['zs','ls','ww']})
# 默认会有索引,将ID列设置成索引,会返回一个新的df,如果想要在原来的df上修改需要添加参数inplace=True
df = df.set_index('id')
df.to_excel('./output.xlsx')
print('end')
# 2. 使用pandas读取文件
import pandas as pd
# 此处需要安装依赖库xlrd
people = pd.read_excel('~/Desktop/People.xlsx')
print('获取文件中的行和列:',people.shape)
print("-"*20)
print('获取文件中的列名:',people.columns)
print("-"*20)
# 默认取前五行
print('获取文件中的前几行数据信息:',people.head())
print("-"*20)
print('获取文件中的后几行数据信息:',people.tail())
print("-"*20)
# 注意常见问题:
# 1. 读取的时候,默认会将第一行作为列名,我们可以修改
people = pd.read_excel('~/Desktop/People.xlsx',header = 1)
print(people.columns)
输出:
获取文件中的行和列: (19972, 6)
--------------------
获取文件中的列名: Index(['ID', 'Type', 'Title', 'FirstName', 'MiddleName', 'LastName'], dtype='object')
--------------------
获取文件中的前几行数据信息: ID Type Title FirstName MiddleName LastName
0 1 Employee NaN Ken J Sánchez
1 2 Employee NaN Terri Lee Duffy
2 3 Employee NaN Roberto NaN Tamburello
3 4 Employee NaN Rob NaN Walters
4 5 Employee Ms. Gail A Erickson
--------------------
获取文件中的后几行数据信息: ID Type Title FirstName MiddleName LastName
19967 20773 Individual Customer NaN Crystal NaN Guo
19968 20774 Individual Customer NaN Isabella F Richardson
19969 20775 Individual Customer NaN Crystal S He
19970 20776 Individual Customer NaN Crystal NaN Zheng
19971 20777 Individual Customer NaN Crystal NaN Hu
--------------------
Index([1, 'Employee', 'NULL', 'Ken', 'J', 'Sánchez'], dtype='object')
# 2. 使用pandas读取文件
import pandas as pd
#2. 如果第一行或者其他行不满足我们的需求时,我们可以自定义
# 第一种: 设置header为None,会使用默认的01234
people = pd.read_excel('~/Desktop/People.xlsx',header = None)
print(people.columns)
print("-"*20)
print(people.head())
print("-"*20)
# 第二种: 认为的设置默认值
people.columns = ['ID1','Type1','Title1','FirstName1','MiddleName1','LastName1']
print(people.columns)
print("-"*20)
print(people.head())
print("-"*20)
# 重新存储
people.set_index('ID1',inplace = True)
print(people.head())
print("-"*20)
people.to_excel('./People1.xlsx')
print('end')
print("-"*20)
# 注意读取数据的时候,会将ID1右作为一列输出出来,所以可以在读取的时候用参数指定一下
people1 = pd.read_excel('./People1.xlsx',index_col = "ID1")
print(people1.head())
输出:
Int64Index([0, 1, 2, 3, 4, 5], dtype='int64')
--------------------
0 1 2 3 4 5
0 ID Type Title FirstName MiddleName LastName
1 1 Employee NaN Ken J Sánchez
2 2 Employee NaN Terri Lee Duffy
3 3 Employee NaN Roberto NaN Tamburello
4 4 Employee NaN Rob NaN Walters
--------------------
Index(['ID1', 'Type1', 'Title1', 'FirstName1', 'MiddleName1', 'LastName1'], dtype='object')
--------------------
ID1 Type1 Title1 FirstName1 MiddleName1 LastName1
0 ID Type Title FirstName MiddleName LastName
1 1 Employee NaN Ken J Sánchez
2 2 Employee NaN Terri Lee Duffy
3 3 Employee NaN Roberto NaN Tamburello
4 4 Employee NaN Rob NaN Walters
--------------------
Type1 Title1 FirstName1 MiddleName1 LastName1
ID1
ID Type Title FirstName MiddleName LastName
1 Employee NaN Ken J Sánchez
2 Employee NaN Terri Lee Duffy
3 Employee NaN Roberto NaN Tamburello
4 Employee NaN Rob NaN Walters
--------------------
end
--------------------
Type1 Title1 FirstName1 MiddleName1 LastName1
ID1
ID Type Title FirstName MiddleName LastName
1 Employee NaN Ken J Sánchez
2 Employee NaN Terri Lee Duffy
3 Employee NaN Roberto NaN Tamburello
4 Employee NaN Rob NaN Walters
import pandas as pd
# 指定读哪个表
sheet = pd.read_excel('~/Desktop/sheet.xlsx',sheet_name='sheet2')
print(sheet.head())
print("-"*20)
# 3. 如果数据在表格中没有顶格写时
# skiprows : 跳过几行
# usecols: 使用那几列(C,指的就是Excel上的ABCD....)
book = pd.read_excel('~/Desktop/Books.xlsx',skiprows=3,usecols ="C:F")
print(book.head())
输出:
ID age
0 0 18
1 1 19
--------------------
ID Name InStore
0 NaN Book_001 NaN
1 NaN Book_002 NaN
2 NaN Book_003 NaN
3 NaN Book_004 NaN
4 NaN Book_005 NaN
import pandas as pd
book = pd.read_excel('~/Desktop/Books.xlsx',skiprows=3,usecols ="C:F")
print(book['ID'])
print("-"*20)
# 修改ID 的值
book["ID"].at[0] = 1
print(book['ID'])
print("-"*20)
''' at和iat函数是只能选择某个位置的值,iat是按照行索引和列索引的位置来选取数据的。而at是按照行索引和列索引来选 取数据;
loc和iloc函数的功能包含at和iat函数的功能
'''
# 使用for循环给ID列添加内容
for i in book.index:
book["ID"].at[i] = i+1
# # 注意: 添加后ID由于默认是float类型,修改一下类型
book["ID"]= book["ID"].astype(int)
print(book['ID'])
print("-"*20)
# # 给inStore添加交替值
book["InStore"] = book["InStore"].astype(str)
for i in book.index:
book["InStore"].at[i] = 'Yes' if i%2==0 else "No"
book.set_index('ID',inplace=True)
book.to_excel('./books1.xlsx')
输出:
0 NaN
1 NaN
2 NaN
3 NaN
4 NaN
5 NaN
6 NaN
7 NaN
8 NaN
9 NaN
10 NaN
11 NaN
12 NaN
13 NaN
14 NaN
15 NaN
16 NaN
17 NaN
18 NaN
19 NaN
Name: ID, dtype: float64
--------------------
0 1.0
1 NaN
2 NaN
3 NaN
4 NaN
5 NaN
6 NaN
7 NaN
8 NaN
9 NaN
10 NaN
11 NaN
12 NaN
13 NaN
14 NaN
15 NaN
16 NaN
17 NaN
18 NaN
19 NaN
Name: ID, dtype: float64
--------------------
0 1
1 2
2 3
3 4
4 5
5 6
6 7
7 8
8 9
9 10
10 11
11 12
12 13
13 14
14 15
15 16
16 17
17 18
18 19
19 20
Name: ID, dtype: int64
原始数据:
修改后:
函数填充计算列
import pandas as pd
books = pd.read_excel('~/Desktop/04Books.xlsx',index_col = 'ID')
print(books)
print("-"*20)
# 计算Price的值(这种方法是列与列之间对齐后进行计算)
books["Price"] = books['OnePrice'] * books['Count']
print(books)
print("-"*20)
# 如果只想算某一段就可以,使用循环迭代(是单元格与单元格之间的操作)
for i in range(5,16):
books["Price"].at[i] = books["OnePrice"].at[i] * books["Count"].at[i]
print(books)
# 如果想修改原文件,直接写回去就可以
books.to_excel('./05Books.xlsx')
输出:
Name OnePrice Count Price
ID
1 Product_001 9.82 5 NaN
2 Product_002 11.99 4 NaN
3 Product_003 9.62 6 NaN
4 Product_004 11.08 8 NaN
5 Product_005 7.75 3 NaN
6 Product_006 7.34 4 NaN
7 Product_007 10.97 6 NaN
8 Product_008 11.14 7 NaN
9 Product_009 8.98 2 NaN
10 Product_010 9.18 3 NaN
11 Product_011 8.31 4 NaN
12 Product_012 7.29 9 NaN
13 Product_013 8.36 5 NaN
14 Product_014 9.16 6 NaN
15 Product_015 10.31 3 NaN
16 Product_016 10.26 6 NaN
17 Product_017 11.95 8 NaN
18 Product_018 11.22 2 NaN
19 Product_019 10.95 4 NaN
20 Product_020 8.82 6 NaN
--------------------
Name OnePrice Count Price
ID
1 Product_001 9.82 5 49.10
2 Product_002 11.99 4 47.96
3 Product_003 9.62 6 57.72
4 Product_004 11.08 8 88.64
5 Product_005 7.75 3 23.25
6 Product_006 7.34 4 29.36
7 Product_007 10.97 6 65.82
8 Product_008 11.14 7 77.98
9 Product_009 8.98 2 17.96
10 Product_010 9.18 3 27.54
11 Product_011 8.31 4 33.24
12 Product_012 7.29 9 65.61
13 Product_013 8.36 5 41.80
14 Product_014 9.16 6 54.96
15 Product_015 10.31 3 30.93
16 Product_016 10.26 6 61.56
17 Product_017 11.95 8 95.60
18 Product_018 11.22 2 22.44
19 Product_019 10.95 4 43.80
20 Product_020 8.82 6 52.92
--------------------
Name OnePrice Count Price
ID
1 Product_001 9.82 5 49.10
2 Product_002 11.99 4 47.96
3 Product_003 9.62 6 57.72
4 Product_004 11.08 8 88.64
5 Product_005 7.75 3 23.25
6 Product_006 7.34 4 29.36
7 Product_007 10.97 6 65.82
8 Product_008 11.14 7 77.98
9 Product_009 8.98 2 17.96
10 Product_010 9.18 3 27.54
11 Product_011 8.31 4 33.24
12 Product_012 7.29 9 65.61
13 Product_013 8.36 5 41.80
14 Product_014 9.16 6 54.96
15 Product_015 10.31 3 30.93
16 Product_016 10.26 6 61.56
17 Product_017 11.95 8 95.60
18 Product_018 11.22 2 22.44
19 Product_019 10.95 4 43.80
20 Product_020 8.82 6 52.92
原始数据:
修改后:
排序
import pandas as pd
books = pd.read_excel('~/Desktop/05List.xlsx')
print(books)
print("-"*20)
# 将价钱进行排列
'''
by='Price' : 跟据哪一列进行排序
inplace=True : 是否在原有的DataFrame上修改,
ascending = True : 默认为升序
'''
books.sort_values(by='Price',inplace=True,ascending = False)
# 将买过的书按价格高低排列
'''
by=['Buy','Price'] :先排Buy,在这个基础上排Price
ascending = [True,False]: 分别指定buy和price的升降序
'''
books.sort_values(by=['Buy','Price'],inplace=True,ascending = [True,False])
print(books)
输出:
ID Name Price Buy
0 1 Product_001 9.82 Yes
1 2 Product_002 11.99 Yes
2 3 Product_003 9.62 No
3 4 Product_004 11.08 Yes
4 5 Product_005 7.75 Yes
5 6 Product_006 7.34 No
6 7 Product_007 10.97 Yes
7 8 Product_008 11.14 Yes
8 9 Product_009 8.98 No
9 10 Product_010 9.18 Yes
10 11 Product_011 8.31 Yes
11 12 Product_012 7.29 No
12 13 Product_013 8.36 Yes
13 14 Product_014 9.16 Yes
14 15 Product_015 10.31 No
15 16 Product_016 10.26 Yes
16 17 Product_017 11.95 Yes
17 18 Product_018 11.22 No
18 19 Product_019 10.95 Yes
19 20 Product_020 8.82 Yes
--------------------
ID Name Price Buy
17 18 Product_018 11.22 No
14 15 Product_015 10.31 No
2 3 Product_003 9.62 No
8 9 Product_009 8.98 No
5 6 Product_006 7.34 No
11 12 Product_012 7.29 No
1 2 Product_002 11.99 Yes
16 17 Product_017 11.95 Yes
7 8 Product_008 11.14 Yes
3 4 Product_004 11.08 Yes
6 7 Product_007 10.97 Yes
18 19 Product_019 10.95 Yes
15 16 Product_016 10.26 Yes
0 1 Product_001 9.82 Yes
9 10 Product_010 9.18 Yes
13 14 Product_014 9.16 Yes
19 20 Product_020 8.82 Yes
12 13 Product_013 8.36 Yes
10 11 Product_011 8.31 Yes
4 5 Product_005 7.75 Yes
数据库连接
import pandas as pd
import pymysql
conn=pymysql.connect(host='localhost',user='root',passwd='123',db='tieba',port=3306,charset='utf8mb4')
query = 'SELECT id,info,url FROM tieba_info'
df = pd.read_sql_query(query,conn)
print(df)
赠人玫瑰,手有余香,如果我的文章有幸能够帮到你,麻烦帮忙点下右下角的推荐,谢谢!
作者: imcati
出处: https://www.cnblogs.com/imcati/>
本文版权归作者所有,欢迎转载,但未经作者同意必须保留此段声明,且在文章页面明显位置给出, 原文链接