pandas

之前学的:

# -*- coding: utf-8 -*-
"""
Created on Sat May 30 13:54:31 2020

@author: Administrator
"""
#Series   系列
import numpy as np
import pandas as pd
# Build a Series from a NumPy array with an explicit label index.
s = pd.Series(np.arange(1, 4), index=["A", "B", "C"])
# list("abc") expands to ["a", "b", "c"]; this rebinding replaces the
# upper-case-labelled Series created on the previous line.
s = pd.Series(np.arange(1, 4), index=list("abc"))

print(s)
print(type(s))
# Build a Series from a dict: the keys become the index labels.
temp_dict = {"name": "吴森", "age": "24", "性格": "二比"}
print(pd.Series(temp_dict))

print(s)
print(s["a"])          # label-based lookup
# Position-based access goes through .iloc: integer keys passed to plain
# [] on a string-labelled Series rely on a deprecated positional fallback.
print(s.iloc[0])
print(s.iloc[2:])
print(s.iloc[[0, 1]])

print(s[s > 2])        # boolean-mask selection

print(s.index)
print(s.values)



#读取文件
import pandas as pd
# Load the CSV file into a DataFrame.
f = pd.read_csv("f://pa.csv")
# Database counterpart, kept as a reminder:
#   pd.read_sql(sql_sentence, connection)
# Show the column labelled "a".
print(f.loc[:, "a"])




#dataframe
import numpy as np
import pandas as pd
 
# Six consecutive daily timestamps used as the row index.
dates = pd.date_range('2019-08-01', periods=6)
# Bind the frame to `df`, not `pd`: reusing `pd` would shadow the pandas
# module alias and break every later `pd.<function>` call.
df = pd.DataFrame(np.random.randn(6, 4), index=dates, columns=['A', 'B', 'C', 'D'])
print('输出6行4列的表格:')
print(df)
print('\n')

print('输出第二列:')
print(df['B'])  # selecting a single column by label yields a Series
print('\n')
 
----------执行以上程序,返回的结果为----------

输出6行4列的表格:
                   A         B         C         D
2019-08-01  0.796050 -0.383286 -1.465294 -0.272321
2019-08-02 -1.431981 -0.875381  1.371449  0.321703
2019-08-03 -1.497636  1.258925 -1.374210 -0.765626
2019-08-04  2.518305  0.125094  2.647512 -0.024748
2019-08-05 -0.319238  0.395384 -0.582052 -0.396132
2019-08-06 -0.519434  1.873216  1.685524 -1.493000

输出第二列:
2019-08-01   -0.383286
2019-08-02   -0.875381
2019-08-03    1.258925
2019-08-04    0.125094
2019-08-05    0.395384
2019-08-06    1.873216
Freq: D, Name: B, dtype: float64
 
-------------------------------------------









import numpy as np
import pandas as pd
from datetime import datetime as dt
 
print('通过字典创建DataFrame:')
# Each dict key becomes a column. The scalar entries (A, B, F) are
# broadcast to the length of the 4-element columns (C, D, E).
df_1 = pd.DataFrame({
    'A': 1.0,
    'B': pd.Timestamp(2019, 8, 19),
    'C': pd.Series(1, index=list(range(4)), dtype='float32'),
    'D': np.array([3] * 4, dtype='int32'),
    'E': pd.Categorical(['test', 'train', 'test', 'train']),
    'F': 'foo',
})
print(df_1)
print('\n')

print('返回每列的数据类型:')
print(df_1.dtypes)
print('\n')

print('返回行的序号:')
print(df_1.index)
print('\n')

print('返回列的序号名字:')
print(df_1.columns)
print('\n')

print('把每个值进行打印出来:')
print(df_1.values)
print('\n')

print('数字总结:')
# describe() returns its summary statistics as another DataFrame.
print(type(df_1.describe()))
print(df_1.describe())
print('\n')

print('翻转数据:')
print(df_1.T)  # transpose: rows become columns and vice versa
print('\n')

print('按第一列进行排序:')
# axis=1 sorts by the column labels (A..F); ascending=False reverses the
# order. `axis` is passed by keyword because sort_index no longer accepts
# it positionally in pandas 2.x.
print(df_1.sort_index(axis=1, ascending=False))
print('\n')

print('按某列的值进行排序:')
print(df_1.sort_values('E'))  # reorder rows by the values of column E
print('\n')
# Quick-reference snippets. df1, f, df, df2 and data stand for DataFrames
# that must already exist; these lines are reminders, not a standalone demo.
df1.index = pd.Series(['beijing', 'shanghai', 'guangzhou'])  # replace the row index
f.set_index("class3", inplace=True)  # promote a column to the index
df.loc[1]   # select a row by index label
df.iloc[1]  # select a row by integer position
# DataFrame.append was removed in pandas 2.0; concat returns a NEW frame
# with the rows of df2 appended.
pd.concat([df, df2], ignore_index=True)
data.insert(0, 'd', [1, 2])  # insert column 'd' at position 0 (in place)
f["class4"] = [11, 12, 13]  # add a column by assignment
# Handling missing values (the fill value 0 is only illustrative).
df.dropna()
df.fillna(0)
# Turn a placeholder marker into a real missing value.
df.replace("?", np.nan)



#统计类别
import numpy as np
import pandas as pd
f = pd.read_csv("f://dianying.csv")
print(f["class"])
print(f)
# Split each comma-separated "class" cell into a list of labels.
a = f["class"].str.split(",").tolist()
print(a)

# Distinct labels across all rows; sorted so the column order is
# reproducible from run to run (plain set iteration order is not).
b = sorted({label for labels in a for label in labels})
print(b)
# One indicator column per label and one row per record. The shape is
# derived from the data instead of being hard-coded.
m_Data = pd.DataFrame(np.zeros((f.shape[0], len(b))), columns=b)
for row, labels in enumerate(a):
    m_Data.loc[row, labels] = 1
# astype returns a new frame, so the result has to be rebound.
m_Data = m_Data.astype("int")
print(m_Data)
print(f)

# Join on the row index; join also returns a new frame, so keep the result.
merged = f.join(m_Data)

merged.head(10)


# Inner join (intersection of keys): the matched rows are the same
# whichever frame is on the left.
d1 = pd.DataFrame(np.arange(11, 20).reshape((3, 3)), columns=list("abc"))
d2 = pd.DataFrame(np.arange(1, 10).reshape((3, 3)), columns=list("ade"))
print(d1.loc[0, "a"])
d1.loc[0, "a"] = 1  # give d1 a key value that also occurs in d2["a"]
print(d1.head(0))  # head(n) returns the first n rows; n=0 leaves only the header
d1.info()  # info() prints its summary itself and returns None
d1.merge(d2, on="a")
d2.merge(d1, on="a")
d2.merge(d1, on="a", how="outer")
d2.merge(d1, on="a", how="left")   # left join: keep every row of d2
d2.merge(d1, on="a", how="right")  # right join: keep every row of d1
# When the key columns are named differently, use left_on=... / right_on=...
# in place of on=...
d3 = pd.DataFrame(np.arange(1, 5).reshape((2, 2)), columns=list("ab"))
d4 = pd.DataFrame(np.arange(1, 5).reshape((2, 2)), columns=list("ac"))
# Duplicate the key on both sides so the merge is many-to-many.
d3.loc[1, "a"] = 1
d4.loc[1, "a"] = 1
print(d3)
print(d4)
d3.merge(d4, on="a")
d4.merge(d3, on="a")  # how defaults to "inner"

d3.merge(d4, on="a", how="outer")  # outer join: union of the keys from both frames
# Merging on several key columns. d1/d2 have no "id1"/"id2" columns, so the
# example uses frames that actually contain both keys.
k1 = pd.DataFrame({"id1": [1, 2], "id2": [1, 2], "x": [10, 20]})
k2 = pd.DataFrame({"id1": [1, 2], "id2": [1, 3], "y": [30, 40]})
pd.merge(k1, k2, on=["id1", "id2"])






import numpy as np
import pandas as pd
f = pd.read_csv("f://dianying.csv")
print(f["class"])
print(f.groupby(by="class"))
print(type(f.groupby(by="class")))
# Iterating a GroupBy yields (key, sub-frame) pairs.
g = f.groupby(by="class")
for i, j in g:  # i is the group key, j holds all rows of that group
    print(i)
    print("-" * 40)
f.count()
f["class"].count()
f.groupby(by="class").count()
f.groupby(by="class")["class"].count()
f.groupby(by="class")["class"].count()["b,c"]
# Aggregations that operate on the non-null values of each group:
#   count(), sum(), mean(), median(), std(), var(), min(), max()

f.groupby(by="class")  # `by` takes a column name, or a list of keys/arrays
# Group by two keys, count "class2" per group, pick the entries whose first
# key is "a", then count how many such groups there are.
f.groupby(by=[f["class3"], f["class2"]])["class2"].count()["a"].count()



#DataFrame 时间序列
import pandas as pd
import numpy as np
# Two ways to bound date_range: start + end, or start + periods.
# Offset objects are used for month-end and hourly steps because the string
# aliases "M" and "H" are deprecated in recent pandas releases.
pd.date_range(start="20190101", end="20200301", freq=pd.offsets.MonthEnd())
pd.date_range(start="20190101", periods=10, freq="10D")
pd.date_range(start="20190101", periods=10, freq=pd.offsets.Hour(10))

# `format` is normally unnecessary; pass it only for non-standard layouts,
# e.g. format="%Y年%m月%d日" for Chinese-style dates. An empty format string
# cannot match a non-empty date, so it is omitted here.
pd.to_datetime("2020-11-01")
# Small sample frame so the column conversion below is runnable.
d = pd.DataFrame({"class": ["2020-11-01", "2020-11-02"]})
d["class"] = pd.to_datetime(d["class"])

# 100 daily observations, then down-sampled to one row per month.
t = pd.DataFrame(np.arange(1, 101), index=pd.date_range(start="20170101", periods=100, freq="D"))
t.resample(pd.offsets.MonthEnd()).mean()
t.resample(pd.offsets.MonthEnd()).count()


#让某一列替换成索引
import numpy as np
import pandas as pd
# Load the CSV, then promote the "class3" column to the row index.
f = pd.read_csv("f://dianying.csv")
f = f.set_index("class3")
# Add a "class4" column by plain assignment.
f["class4"] = [11, 12, 13]
View Code

 

posted @ 2020-10-10 20:48    阅读(59)  评论(0)  编辑  收藏  举报