数据分析——pandas
简介
1 import pandas as pd 2 3 # 在数据挖掘前一个数据分析、筛选、清理的多功能工具 4 ''' 5 pandas 可以读入excel、csv等文件;可以创建Series序列,DataFrame表格,日期数组data_range 6 '''
数据类型
1 # 将excel文件,csv文件读取并转换为pandas的DataFrame 2 # df_score = pd.read_csv() 3 df_score = pd.read_excel('./score.xlsx') 4 # df_score.values #数据 5 # df_score.columns #列名 6 # print df_score.describe() #计算表的各项数据,count,mean,std,中位数等 7 8 # 创建一个默认索引从0开始的Series 9 s = pd.Series([1, 2, 3, 4, 5, 6]) 10 # 创建自定义索引的数组,索引由index指定,和前面数组依次对应 11 s = pd.Series([1, 2, 3, 4, 5, 6], index=['a', 'b', 'c', 'd', 'e', 'f'], dtype=int) 12 # 使用字典创建一个DataFrame,字典的Key会自动成为列名,一个Key默认对应一列数据 13 df1 = pd.DataFrame({'math': [1, 2, 3, 4, 5], 'physic': [5, 6, 7, 8, 9]}, index=['a', 'b', 'c', 'd', 'e']) 14 ''' 15 # df1.values 数据 16 # df1.head(2) 前两行数据 17 # df1.tail(2) 最后两行数据 18 # df1.index 索引 19 # df1.columns 列名 20 ''' 21 # 生成从20180101开始的时间序列,peroids是增加量,默认增加单位是天D,H小时,s秒 22 dates = pd.date_range('20180101', periods=10, freq='D') 23 # 创建使用时间索引的Series 24 # s = pd.Series(range(10),index=dates) 25 # 取出指定间隔的行数据 26 # s['2018-01-01':'2018-01-05'] 27 # print dates
票房分析
1 df_imdb = pd.read_csv('./IMDB.csv') 2 3 # print df_imdb 4 # print df_imdb.columns 5 # df_imdb['Title'].head(5) #选出Title列的前五行 6 # df_imdb['Title'].tail(3) 7 # df_imdb.Title.head(3) #同[]的形式 8 # df_imdb['Revenue (Millions)'].max() #最大票房 9 # df_imdb['Revenue (Millions)'].idxmax() #最大票房的索引 10 # df_imdb[50:51] 11 # df_imdb[50:51]['Title'] 12 # df_imdb[50:51]['Revenue (Millions)'] #取出50行,不包括51行 13 # 取出50-56行,收尾都包含,第一维度是行,第二维度是列 14 # df_imdb.loc[50:56,['Director','Year']] 15 # df_imdb[50:56].loc[:,'Director','Year'] 16 # 取出1-5行(不包含第5行),2-4(不包含第4列)列的数据,使用整数索引操作,与numpy用法类似 17 # df_imdb.iloc[1:5,2:4] 18 # 统计Director列中不同导演出现的次数 19 # df_imdb['Director'].value_counts() 20 # 将票房大于5亿美元的电影选出来 21 # df_imdb[df_imdb['Revenue (Millions)']>500].Director 22 # df_imdb[df_imdb['Revenue (Millions)']>700]['Title'] 23 # 将电影风格描述中含有Sci-Fi(科幻) 关键字的找出 24 # df_imdb[df_imdb['Genre'].str.contains('Sci-Fi')] 25 26 # 将缺失数据(NaN)填充为0,也可以自己根据项目需求指定其他数据 27 # df_score.fillna(0) 28 # 将缺失数据的行移除(默认操作,可以使用axis=1指定删除列df_score.dropna(axis=1)) 29 # 0删除行,1删除列 30 # df_score.dropna() 31 # 在DataFram中增加一列平均值avg,计算当前DataFram中每行的平均值作为avg的数据 32 # 前后赋值数据的行数要对应,axis=1表示按行计算,axis=0(默认值),表示按列计算 33 # df_score['avg'] = df_score.mean(axis=1) 34 # 按照性别分组并求和指定成绩 35 # df_score.iloc[:,4:7].groupby(u'性别').sum() 36 # df_score.loc[:,[u'音乐',u'性别']].groupby(u'性别').sum() 37 # 按照男女分组并绘图,bar柱状图,pie饼状图 38 # df_score[u'性别'].value_counts().plot(kind='bar') 39 # df_score[u'性别'].value_counts().plot(kind='pie') 40 # & 数学大于80且化学大于60 41 # df_score[(df_score[u'数学']>80) &(df_score[u'化学']>60) ] 42 43 # 使用lambda,配合apply方法将日期中的指定年份或月份等提取出来 44 # apply函数会将lambda一次作用到数据集的每个元素 45 # datas = pd.Series(['20190901','20190902','20190903']) 46 # datas.apply(lambda x:x[0:4]) 47 # datas.apply(lambda x:x[4:6]) 48 49 # 创建一个数据的副本 50 # df_copy = df.copy() 51 # df_copy['R_Sum'] = df['SibSp']+df['Parch'] 52 53 # 计算数学列的总和、平均值等,里面的字符串必须有同名函数 54 # df[u'数学'].agg(['sum','mean','max','std']) 55 56 # pandas(Series、DataFrame)类型转换为numpy(array)类型 57 # df[u'数学'].values 58 # df.loc[:,[u'数学',u'化学']].values 59 60 # 按照指定列的值排序,可指定正序倒序,默认正序 61 # df[u'数学'].sort_values() 62 # 按照索引排序 63 # df[u'数学'].sort_index() 64 # df[u'数学'].sort_values(ascending=False) 65 # 添加新列sum,值为每行总和,并倒序排列 66 # df['sum'] = df.sum(axis=1) 67 # df[u'sum'].sort_values(ascending=False) 68 69 70 # 取出Embarked,Survived字段,按照两个字段顺序做层次分组,然后做计算总和 71 # r = df.loc[:,['Embarked','Survived']].groupby(['Embarked','Survived']).size() 72 # r.C 73 # r.C[1] 74 # r.Q 75 # r.Q[0] 76 # r.Q[1] 77 # r1 = df.loc[:,['Embarked','Survived']].groupby(['Survived','Embarked']).size() 78 # r2 = df.loc[:,['Embarked','Survived']].groupby('Embarked').size() 79 # r3 = df.loc[:,['Embarked','Survived']].groupby('Survived').size()
运行结果
""" 上面的运行结果 r Embarked Survived C 0 75 1 93 Q 0 47 1 30 S 0 427 1 217 dtype: int64 r.C结果 Survived 0 75 1 93 dtype: int64 r.C[1]结果 93 r1结果 Survived Embarked 0 C 75 Q 47 S 427 1 C 93 Q 30 S 217 dtype: int64 r2结果 Embarked C 168 Q 77 S 644 dtype: int64 r3结果 Survived 0 549 1 342 dtype: int64 """
标注:
''' 1.axis转换行列 2.DataFrame筛选一行或一列时会转化为Series类型,可以直接后面加[数字]直接进行选择,但Series不能使用DataFrame的方法(groupby等) 3.筛选出来的数据的索引仍是原索引,不会重新排列新索引 '''
统计拍片数前10的某导演,指导电影的总票房
1 def piaofang(): 2 director10 = df_imdb['Director'].value_counts().head(10) 3 # print director10.index[0] 4 revenues = 0 5 for d in director10.index: 6 print df_imdb[df_imdb['Director'] == d]['Revenue (Millions)'].sum() 7 8 # piaofang() 9 10 # df_imdb[df_imdb['Director']=='']['Revenue (Millions)'].sum()
票房分析
特征
''' PassengerId:乘客的唯一标志 Survived:1获救,0死亡 Pclass:座舱等级 3最好,1最差 Name,Sex,Age, SibSp:船上有没有兄弟姐妹 Parch:父母等直系亲属是否在船上 Ticket, Fare:票价或消费 Cabin:座舱号 Embarked:从哪个港口登船 891 '''
导入类库
1 import numpy as np 2 import matplotlib.pyplot as pt 3 import pandas as pd
准备数据
1 titanic = pd.read_csv('./Titanic.csv') 2 3 titanic.fillna(int(titanic[u'Age'].mean()))
测试代码
1 # print titanic['Age'] 2 3 # print titanic[u'Age'].mean() 4 # print titanic.loc[:,u'Survived'].value_counts() #存活比例 5 # print titanic.loc[:,u'Survived'].count() #总人数 6 7 # print titanic.loc[:, u'Sex'].value_counts() #男女分类 8 # print titanic[titanic[u'Sex'] == u'male']['Survived'].value_counts() #男性生死分类 9 10 # print titanic.columns 11 # print titanic[titanic[u'Age'] <= 18][u'Survived'].value_counts() 12 # print titanic[(titanic[u'Age'] > 18) & (titanic[u'Age'] < 60)][u'Survived'].value_counts() 13 # print titanic[titanic[u'Age'] >= 60][u'Survived'].value_counts() 14 15 # print titanic[u'Fare'] 16 # print titanic[u'Fare'].max() #贫富差距 17 # print titanic[u'Fare'].min() 18 19 # print titanic[u'Pclass'].value_counts() 20 # print titanic[u'Pclass'].value_counts()[1] 21 # print titanic[u'Pclass'].value_counts()[3] #座舱 22 # print titanic[titanic[u'Pclass'] == 1]['Survived'].value_counts() 23 # print titanic[titanic[u'Pclass'] == 3]['Survived'].value_counts() 24 25 # print titanic[u'SibSp'].value_counts() 26 # print titanic[u'Parch'].value_counts() 27 28 # print titanic[u'Embarked'].value_counts()
案例源码
1 class Titanic(object): 2 def __init__(self): 3 self.data = titanic 4 5 # 1.存活率是多少 6 def rate_survive(self): 7 survived = self.data.loc[:, 'Survived'].value_counts()[1] 8 death = self.data.loc[:, 'Survived'].value_counts()[0] 9 rate = float(survived) / (float(death) + float(survived)) 10 print '总人数:{},存活人数:{},死亡人数:{}'.format(survived + death, survived, death) 11 return u'存活率:' + '%.2f' % rate 12 13 # 2.哪个年龄段存活率最高 14 def max_survive(self): 15 age18_survived = self.data[self.data[u'Age'] <= 18][u'Survived'].value_counts()[1] 16 age18_death = self.data[self.data[u'Age'] <= 18][u'Survived'].value_counts()[0] 17 age18_rate = float(age18_survived) / (float(age18_survived) + float(age18_death)) 18 19 age1860_survived = self.data[(self.data[u'Age'] > 18) & (self.data[u'Age'] < 60)][u'Survived'].value_counts()[1] 20 age1860_death = self.data[(self.data[u'Age'] > 18) & (self.data[u'Age'] < 60)][u'Survived'].value_counts()[0] 21 age1860_rate = float(age1860_survived) / (float(age1860_survived) + float(age1860_death)) 22 23 age60_survived = self.data[self.data[u'Age'] >= 60][u'Survived'].value_counts()[1] 24 age60_death = self.data[self.data[u'Age'] >= 60][u'Survived'].value_counts()[0] 25 age60_rate = float(age60_survived) / (float(age60_survived) + float(age60_death)) 26 27 rate = [age18_rate, age60_rate, age1860_rate] 28 age_data = ['18岁以下', '18-60岁', '60岁以上'] 29 max_rate = max(rate) 30 age_range = age_data[rate.index(max(rate))] 31 return '存活率最高的年龄段是{},存活率为{}'.format(age_range, max_rate) 32 33 # 3.女性存活率是否高于男性 34 def than_survive(self): 35 male_survied = self.data[self.data[u'Sex'] == u'male'][u'Survived'].value_counts()[1] 36 male_death = self.data[self.data[u'Sex'] == u'male'][u'Survived'].value_counts()[0] 37 rate_male = float(male_survied) / (float(male_survied) + float(male_death)) 38 print '男性共有{}人,存活{}人,死亡{}人'.format(male_death + male_survied, male_survied, male_death) 39 female_survied = self.data[self.data[u'Sex'] == u'female'][u'Survived'].value_counts()[1] 40 female_death = self.data[self.data[u'Sex'] == u'female'][u'Survived'].value_counts()[0] 41 rate_female = float(female_survied) / (float(female_survied) + float(female_death)) 42 print '女性共有{}人,存活{}人,死亡{}人'.format(female_death + female_survied, female_survied, female_death) 43 if rate_male > rate_female: 44 return u'男性存活率更高,存活率为:%.2f' % rate_male 45 else: 46 return u'女性存活率更高,存活率为:%.2f' % rate_female 47 48 # 4.船上是否出现贫富差距 49 def poor_wealth(self): 50 max_wealth = self.data[u'Fare'].max() 51 max_poor = self.data[u'Fare'].min() 52 if max_wealth - max_poor > 500: 53 return '船上乘客最多消费了{},最少消费了{},存在贫富差距'.format(max_wealth, max_poor) 54 else: 55 return '船上乘客最多消费了{},最少消费了{},不存在贫富差距'.format(max_wealth, max_poor) 56 57 # 5.头等舱乘客的存活率是否高于经济舱 58 def pclass_survive(self): 59 pclass1_survived = self.data[self.data[u'Pclass'] == 1]['Survived'].value_counts()[1] 60 pclass1_death = self.data[self.data[u'Pclass'] == 1]['Survived'].value_counts()[0] 61 pclass1_rate = float(pclass1_survived) / (float(pclass1_survived) + float(pclass1_death)) 62 63 pclass3_survived = self.data[self.data[u'Pclass'] == 3]['Survived'].value_counts()[1] 64 pclass3_death = self.data[self.data[u'Pclass'] == 3]['Survived'].value_counts()[0] 65 pclass3_rate = float(pclass3_survived) / (float(pclass3_survived) + float(pclass3_death)) 66 67 if pclass3_rate > pclass1_rate: 68 return '头等舱乘客存活率更高,存活率为{}'.format(pclass3_rate) 69 else: 70 return '经济舱乘客存活率更高,存活率为{}'.format(pclass1_rate) 71 72 # 6.有亲属在船上乘客比率,有亲属是否会影响存活率 73 def family_survive(self): 74 has_family = self.data[(self.data[u'Parch'] != 0) | (self.data[u'SibSp'] != 0)][u'PassengerId'].count() 75 no_family = self.data[(self.data[u'Parch'] == 0) & (self.data[u'SibSp'] == 0)][u'PassengerId'].count() 76 rate_family = float(has_family) / (float(has_family) + float(no_family)) 77 78 has_family_survived = \ 79 self.data[(self.data[u'Parch'] != 0) | (self.data[u'SibSp'] != 0)][u'Survived'].value_counts()[1] 80 has_family_death = \ 81 self.data[(self.data[u'Parch'] != 0) | (self.data[u'SibSp'] != 0)][u'Survived'].value_counts()[0] 82 has_family_rate = float(has_family_survived) / (float(has_family_survived) + float(has_family_death)) 83 84 no_family_survived = \ 85 self.data[(self.data[u'Parch'] == 0) & (self.data[u'SibSp'] == 0)][u'Survived'].value_counts()[1] 86 no_family_death = \ 87 self.data[(self.data[u'Parch'] == 0) & (self.data[u'SibSp'] == 0)][u'Survived'].value_counts()[0] 88 no_family_rate = float(no_family_survived) / (float(no_family_survived) + float(no_family_death)) 89 90 print '船上乘客中有亲属也在船上的有{}人,无亲属在船上的有{}人,有亲属在船上的乘客的比率为{}'.format(has_family, no_family, rate_family) 91 if has_family_rate > no_family_rate: 92 return '有亲属在船上的乘客存活率更高,存活率为{}'.format(has_family_rate) 93 else: 94 return '无亲属在船上的乘客存活率更高,存活率为{}'.format(no_family_rate) 95 96 # 7.从哪个港口登船是否影响获救 97 def emarked_survive(self): 98 Embarked_S_survived = self.data[self.data[u'Embarked'] == 'S'][u'Survived'].value_counts()[1] 99 Embarked_S_death = self.data[self.data[u'Embarked'] == 'S'][u'Survived'].value_counts()[0] 100 Embarked_S_rate = float(Embarked_S_survived) / (float(Embarked_S_survived) + float(Embarked_S_death)) 101 102 Embarked_C_survived = self.data[self.data[u'Embarked'] == 'C'][u'Survived'].value_counts()[1] 103 Embarked_C_death = self.data[self.data[u'Embarked'] == 'C'][u'Survived'].value_counts()[0] 104 Embarked_C_rate = float(Embarked_C_survived) / (float(Embarked_C_survived) + float(Embarked_C_death)) 105 106 Embarked_Q_survived = self.data[self.data[u'Embarked'] == 'Q'][u'Survived'].value_counts()[1] 107 Embarked_Q_death = self.data[self.data[u'Embarked'] == 'Q'][u'Survived'].value_counts()[0] 108 Embarked_Q_rate = float(Embarked_Q_survived) / (float(Embarked_Q_survived) + float(Embarked_Q_death)) 109 110 embarked = ['S港口', 'C港口', 'Q港口'] 111 rate = [Embarked_S_rate, Embarked_C_rate, Embarked_Q_rate] 112 max_rate = max(rate) 113 return '{}存活率最大,为{}'.format(embarked[rate.index(max_rate)], max_rate) 114 115 # 8.不同年龄段女性的获救率 116 def female_survive(self): 117 female18_survived = \ 118 self.data[(self.data[u'Age'] <= 18) & (self.data[u'Sex'] == u'female')][u'Survived'].value_counts()[1] 119 female18_death = \ 120 self.data[(self.data[u'Age'] <= 18) & (self.data[u'Sex'] == u'female')][u'Survived'].value_counts()[0] 121 female18_rate = float(female18_survived) / (float(female18_survived) + float(female18_death)) 122 123 female1850_survived = \ 124 self.data[(self.data[u'Age'] > 18) & (self.data[u'Age'] < 50) & (self.data[u'Sex'] == u'female')][ 125 u'Survived'].value_counts()[1] 126 female1850_death = \ 127 self.data[(self.data[u'Age'] > 18) & (self.data[u'Age'] < 50) & (self.data[u'Sex'] == u'female')][ 128 u'Survived'].value_counts()[0] 129 female1850_rate = float(female1850_survived) / (float(female1850_survived) + float(female1850_death)) 130 131 female50_survived = \ 132 self.data[(self.data[u'Age'] >= 50) & (self.data[u'Sex'] == u'female')][u'Survived'].value_counts()[1] 133 female50_death = \ 134 self.data[(self.data[u'Age'] >= 50) & (self.data[u'Sex'] == u'female')][u'Survived'].value_counts()[0] 135 female50_rate = float(female50_survived) / (float(female50_survived) + float(female50_death)) 136 137 return '18岁以下女性存活率:{},18-50岁女性存活率:{},50岁以上女性存活率:{}'.format(female18_rate, female1850_rate, female50_rate) 138 139 140 if __name__ == '__main__': 141 tt = Titanic() 142 # print tt.rate_survive() 143 # print tt.than_survive() 144 # print tt.max_survive() 145 # print tt.poor_wealth() 146 # print tt.pclass_survive() 147 # print tt.family_survive() 148 # print tt.emarked_survive() 149 print tt.female_survive()
DATA-->INFOMATION-->KNOWLEDGE-->WISDOM
数据-->信息-->知识-->智慧
爬虫-->数据库-->数据分析-->机器学习
- 信息:通过某种方式组织和处理数据,分析数据间的关系,数据就有了意义
- 知识:如果说数据是一个事实的集合,从中可以得出关于事实的结论。那么知识(Knowledge)就是信息的集合,它使信息变得有用。知识是对信息的应用,是一个对信息判断和确认的过程,这个过程结合了经验、上下文、诠释和反省。知识可以回答“如何?”的问题,可以帮助我们建模和仿真
- 智慧:智慧可以简单的归纳为做正确判断和决定的能力,包括对知识的最佳使用。智慧可以回答“为什么”的问题。回到前面的例子,根据故障对客户的业务影响可以识别改进点