# Series对象与DataFrame对象 (Series objects and DataFrame objects)
"""Walkthrough of pandas basics.

Sections, in order: building Series and DataFrame objects, the immutable
Index type and its set operations, the loc/iloc indexers, index alignment
in arithmetic, and detecting / dropping / filling missing values.

Every section prints its results; the script has no other output.
"""
import numpy as np
import pandas as pd

# ---------------------------------------------------------------------------
# Creating Series objects: pd.Series(data, index=index)
# ---------------------------------------------------------------------------

# An explicit index does not have to start at 0, be ordered, or be contiguous.
x = pd.Series([1, 2, 3, 4], index=[3, 4, 5, 6])
print(x)

# A scalar is repeated for every index label (similar to broadcasting).
x = pd.Series("Hanks", index=[1, 2, 4, 5])
print(x)

# With a dict as data and an explicit index, only the key-value pairs whose
# keys are named in the index are kept.
x = pd.Series({3: 'c', 2: 'b', 1: 'a'}, index=[2, 3])
print(x)

# ---------------------------------------------------------------------------
# Creating DataFrame objects
# ---------------------------------------------------------------------------

population = {'henan': 1000, 'shandong': 200, 'hubei': 400}
area = {'henan': 98, 'shandong': 900, 'hubei': 4000}
# `population` is converted to a Series; `area` is passed on as a plain dict.
population = pd.Series(population)
province = pd.DataFrame({'population': population, 'area': area})
print(province)
# Unlike a plain 2-D array, indexing a DataFrame with a single key returns
# a column.
print(province['area'])

# Build a DataFrame from a 2-D array with explicit row and column labels.
abc = pd.DataFrame(
    np.random.rand(3, 2),
    columns=['foo', 'bar'],
    index=['a', 'b', 'c'],
)
print(abc)

# ---------------------------------------------------------------------------
# Index: an immutable array
# ---------------------------------------------------------------------------

x = pd.Index([3, 2, 5, 9])
# Item assignment on an Index raises TypeError ("Index does not support
# mutable operations"). Catch it so the demonstration does not abort the
# rest of the script.
try:
    x[4] = 5
except TypeError as err:
    print(err)
print(x)

y = pd.Index([4, 6, 9, 23, 3])
# Use the explicit set methods: the &, | and ^ operators on Index objects no
# longer perform set operations in current pandas.
print(x.intersection(y))          # intersection
print(x.union(y))                 # union
print(x.symmetric_difference(y))  # symmetric difference

# ---------------------------------------------------------------------------
# Indexers: loc and iloc
# ---------------------------------------------------------------------------

data = pd.Series(['a', 'b', 'c'], index=[1, 3, 5])
print(data[3])         # explicit (label-based) lookup
print(data[1:3])       # implicit (position-based) slice
print(data.loc[1:3])   # explicit (label-based) slice
print(data.iloc[1:3])  # implicit (position-based) slice

# ---------------------------------------------------------------------------
# Selecting data from a DataFrame
# ---------------------------------------------------------------------------

area = pd.Series({
    'California': 423967,
    'Texas': 695662,
    'New York': 141297,
    'Florida': 170312,
    'Illinois': 149995,
})
pop = pd.Series({
    'California': 38332521,
    'Texas': 26448193,
    'New York': 19651127,
    'Florida': 19552860,
    'Illinois': 12882135,
})
data = pd.DataFrame({'area': area, 'pop': pop})

# Attribute access is not always usable: it fails when a column name collides
# with a DataFrame method name or is not a string. Here `data.pop` is the
# DataFrame.pop method, not the 'pop' column.
print(data.area, data.pop)
# Compare the object identity of attribute access and item access.
print(data.area is data['area'])

data['density'] = data['pop'] / data['area']
print(data.values)
print(data.T)

# iloc: position-based rows and columns.
print(data.iloc[:3, :2])
# loc: label-based rows and columns (both end labels are included).
print(data.loc[:'Illinois', :'pop'])
# The mixed `.ix` indexer has been removed from pandas; chain iloc (rows by
# position) with loc (columns by label) to get the same selection.
print(data.iloc[:3].loc[:, :'pop'])

x = np.random.RandomState(43)
print(x)

# ---------------------------------------------------------------------------
# Arithmetic: unary operations preserve index and column labels; binary
# operations align the operands on their indexes first.
# ---------------------------------------------------------------------------

area = pd.Series(
    {'Alaska': 1723337, 'Texas': 695662, 'California': 423967},
    name='area',
)
population = pd.Series(
    {'California': 38332521, 'Texas': 26448193, 'New York': 19651127},
    name='population',
)
# Index alignment: labels present in only one operand produce NaN.
print(area / population)
x = area / population

A = np.random.randint(10, size=(3, 4))
print(A)
# Subtract the first row from every row.
print(A - A[0])

# ---------------------------------------------------------------------------
# Missing values: detect, drop, fill
# ---------------------------------------------------------------------------

print(x.isnull())
print(x[x.notnull()])
print(x.dropna())
# dropna() returned a new Series; `x` itself is unchanged.
print(x)

df = pd.DataFrame([
    [1, np.nan, 2],
    [2, 3, 5],
    [np.nan, 4, 6],
])
print(df.dropna())                # drop rows containing any NaN
print(df.dropna(axis='columns'))  # drop columns containing any NaN

df[3] = np.nan
print(df)
# Drop only the columns that are entirely NaN.
print(df.dropna(axis='columns', how='all'))
# Keep only the rows that have at least 3 non-NaN values.
print(df.dropna(axis='index', thresh=3))

# Fill every NaN with one constant.
print(df.fillna(9999))
# Forward fill along each row (propagate the previous value).
print(df.ffill(axis=1))
# Backward fill along each row (propagate the next value).
print(df.bfill(axis=1))