Pandas 学习笔记
一、用Pandas创建Excel文件
import pandas as pd
df = pd.DataFrame({'ID':[1,2,3],'Name':['liujun','daifen','duanziqian']}) # 创建一个数据集
df = df.set_index('ID') # 将ID设为索引
df.set_index('ID',inplace = True) # 将ID设为索引的第二种方法
df.to_excel('E:/img/test.xlsx') # 保存为本地Excel
print(df)
print("Done!")
二、数据读取
import pandas as pd
# header 用来设置把那一行当做列属性行 ,但是如果前面行为空,读取时能够跳过空行,如果文件本身没有列索引,可以将Header = None
# Header = None 后可以人为设定 列属性
# test.columns = ['姓名','年级','',]
test = pd.read_excel('E:/img/test.xlsx',header=0,index_col='姓名')# index_col='姓名' 用来指明索引列
print(test.shape) # 输出表格大小
print(test.columns) # 输出列属性
print(test.head(3)) # 默认输出前 5 行
print("================================")
print(test.tail(3)) # 默认输出后 3 行
三、操纵行列,单元格
生成序列
import pandas as pd
L1 = [100,200,300]
L2 = ['x','y','z']
s1 = pd.Series(L1,index=L2)
# 或者直接 s1 = pd.Series([100,200,300],index=['x','y','z'])
print(s1)
"""
d = {'x':100,'y':200,'z':300} # 键值对,字典
s1 = pd.Series(d) # 生成一个序列对象
print(s1)
"""
把序列加入表格,以列的形式
import pandas as pd
s1 = pd.Series([1,2,3],index=[1,2,3],name='A')
s2 = pd.Series([10,20,30],index=[1,2,3],name='B')
s3 = pd.Series([100,200,300],index=[1,2,3],name='C')
df = pd.DataFrame({s1.name:s1,s2.name:s2,s3.name:s3}) # 把s1,s2,s3当做列加入表格
print(df)
四、自动填充,读取跳过空行空列
拿到 Series 然后改单元格
import pandas as pd
from datetime import date,timedelta
def add_month(d,md):
yd = md // 12
m = d.month + md % 12
if m != 12:
yd += m//12
m=m%12
return date(d.year+yd,m,d.day)
# skiprows 跳过指定多少行,usecols:只读取指定列
books = pd.read_excel('E:/img/book.xlsx',skiprows=3,usecols="C:F",index_col=None,dtype={'ID':str,'Instore':str,'Date':str})
start = date(2018,1,1) # 注意是小括号
for i in books.index:
books['ID'].at[i] = i+1
if i%2 == 0:
books['Instore'].at[i] = 'Yes'
else:
books['Instore'].at[i] = 'No'
# books['Date'].at[i] = start + timedelta(days=i) # 加日期
# books['Date'].at[i] = date(start.year+i,start.month,start.day) # 加年份
books['Date'].at[i] = add_month(start,i) # 加月份
print(books)
直接在 DateFrame 中改
books[i,'ID'] = i+1
books[i,'Instore'] = 'No'
books[i,'Date'] = add_month(start,i)
五、列乘法
直接 一列 乘 一列,它自动会逐项把第一列的元素乘以第二列的元素
import pandas as pd
books = pd.read_excel('E:/img/books.xlsx',index_col='ID')
books['Price'] = books['ListPrice'] * books['Discount']
print(books)
加减乘除同样的处理思维
只算某几行,可以用循环
import pandas as pd
books = pd.read_excel('E:/img/books.xlsx',index_col='ID')
for i in range(5,16):
books['Price'].at[i] = books['ListPrice'].at[i]*books['Discount'].at[i]
print(books)
调用 apply() 函数操作
import pandas as pd
def add_2(x):
return x+2
books = pd.read_excel('E:/img/books.xlsx',index_col='ID')
books['ListPrice'] = books['ListPrice'].apply(add_2) #只需要传函数名字
print(books)
P6