1. # -*- coding: utf-8 -*-
  2. """
  3. Created on Wed Feb 15 20:36:15 2017
  4. @author: zzpp220
  5. """
  6. import json,sys
  7. from pandas import DataFrame,Series
  8. import pandas as pd
  9. import numpy as np
  10. #=======================dataset1-web用户数据=======================================================
  11. # path='/media/zzpp220/Data/Linux_Documents/DOWNLOAD/python-DataAnalysis/pydata-book-master/ch02/usagov_bitly_data2012-03-16-1331923249.txt'
  12. # records=[json.loads(lines) for lines in open(path)]##json可将java脚本的格式化成字典对象
  13. #
  14. # frame=DataFrame(records)
  15. # ####==============================================================================
  16. # tz=frame['tz']##只取出该列的所有值,或者tz=frame.tz
  17. # # ##frame['tz']返回的col_tz(series)有 一个value_counts的方法, return series 统计出tz中想同内容有多少个
  18. # # col_tz=frame['tz'].value_counts() ##返回只有一个元素的元组对象
  19. # # subna_tz=tz.fillna('Missing')## repalce 'nan' with 'Missing'
  20. # # subna_tz[subna_tz=='']='Unknown'##!!!查找空白并且替换空白的方法
  21. # # col_tz_1=subna_tz.value_counts()
  22. # # col_tz_1[:10].plot(kind='barh',rot=0)
  23. # #==============================================================================
  24. # a=frame.a.dropna()##去掉‘a’字段中的nan值
  25. # tmp2=[x.split()[0] for x in a]##取'a'字段中每条记录的第一个空格前的内容
  26. # result=Series(tmp2)
  27. # #print result[:10]
  28. # count_result=result.value_counts()
  29. # ##将a字段的非nan的记录按照Windows与否的进行分解
  30. # notnull_a=frame.a.notnull()
  31. # frame_notnulla=frame[notnull_a]
  32. # condition=frame_notnulla['a'].str.contains('Windows')
  33. # operation_system=np.where(condition,'Windows','Not Windows')
  34. # ##根据'tz'字段和新得到的操作系统对数据表进行分组:
  35. # #==============================================================================
  36. # # tz[tz=='']='Unknown'
  37. # # gru_tz_os=frame_notnulla.groupby([tz,operation_system])
  38. # # agg_counts=gru_tz_os.size().unstack().fillna(0)
  39. # #==============================================================================
  40. # gru_tz_os=frame_notnulla.groupby(['tz',operation_system])
  41. # agg_counts=gru_tz_os.size().unstack().fillna(0)
  42. # indexer=agg_counts.sum(1)##对每行的条目进行求和
  43. # sort_index=indexer.argsort()##给每个条目按照总和的高低进行排序,给出索引号
  44. #
  45. # sub_sort_index=agg_counts.take(sort_index)[-10:]###按照上面索引的顺序截取分组表的后10行记录
  46. # sub_sort_index.plot(kind='barh',stacked=True)##堆积
  47. # ##归一化,改进优化比例显示
  48. # normed=sub_sort_index.div(sub_sort_index.sum(1),axis=0)
  49. # normed.plot(kind='barh',stacked=True)
  50. #==============================================================================
  51. #==================dataset2-电影评分============================================================
  52. #
  53. # dir_path='/media/zzpp220/Data/Linux_Documents/DOWNLOAD/python-DataAnalysis/pydata-book-master/ch02/movielens/'
  54. #
  55. # #==================将每个表分别读到一个DataFrame的对象中============================================================
  56. # unames=['user_id','gender','age','occupation','zip_code']##表中的各个字段
  57. # users=pd.read_table(dir_path+'users.dat',sep='::',header=None,names=unames)
  58. #  
  59. # mnames=['movie_id','title','genres']
  60. # movies=pd.read_table(dir_path+'movies.dat',sep='::',header=None,names=mnames)
  61. #  
  62. # rnames=['user_id','movie_id','rating','timestamp']
  63. # ratings=pd.read_table(dir_path+'ratings.dat',sep='::',header=None,names=rnames)
  64. # #==============================================================================
  65. #
  66. # ##==================依次连接多个DataFrame的对象=======================
  67. # whole=pd.merge(pd.merge(users,ratings),movies)##表的顺序就是总表中字段的排列顺序,相同的字段名会自动判断
  68. # ##whole.ix[0] 这是每天记录的查看方法,不能直接whole[0]
  69. # ##按性别计算每部电影的平均分,所以要选择的字段为'title','gender'其中名字作为行名,性别做为字段名
  70. # avg_ratings=whole.pivot_table('rating',index='title',columns='gender',aggfunc='mean')
  71. #
  72. #
  73. # ##先对title进行分组,然后利用size()得到一个包含各电影分组大小的Series对象--rating_by_title
  74. # ##count_movie=whole.title.value_counts()--作用同下
  75. # rating_by_title=whole.groupby('title').size()##直接whole.groupby('title')不会有任何输出
  76. # needed_movie=rating_by_title.index[rating_by_title>=250]##筛选出评分数大于250的电影名
  77. # avg_rating_needed=avg_ratings.ix[needed_movie]
  78. #
  79. # desc_feamale_rating=avg_rating_needed.sort_index(by='F',ascending=False)##按女性的喜爱程度降序排列
  80. #
  81. # ##计算男女分歧最大的电影
  82. # avg_rating_needed['diff']=avg_rating_needed['M']-avg_rating_needed['F']
  83. # sort_by_diff=avg_rating_needed.sort_index(by='diff')
  84. #
  85. # ##
  86. # #========单纯计算分歧最大的电影,计算得分数据的方差或者标准差:对whole按电影名分类======
  87. # ##按照电影名分类,对每个电影类内的rating列进行求平均差
  88. # rating_std_title=whole.groupby('title')['rating'].std()
  89. # ##从计算后的列表中选出评分记录大于250的电影名
  90. # rating_std_title=rating_std_title.ix[needed_movie]
  91. # ##按降序排列,第一即为方差最大的电影,即为分歧最大的电影
  92. # rating_std_title.order(ascending=False)[:10]
  93. # #==============================================================================
  94. #
  95. #
  96. #================dataset3-婴儿姓名==============================================================
  97. #
  98. path_name='/media/zzpp220/Data/Linux_Documents/DOWNLOAD/python-DataAnalysis/pydata-book-master/ch02/names/'
  99. columns=['name','gender','births']
  100. #names1880=pd.read_csv(path_name+'yob1880.txt',names=['name','gender','births'])##birth表示当年出生使用该名字的次数
  101. #birth_sum=names1880.groupby('gender').births.sum()#按'gender'列分组,在每组内,对births列求和
  102. #################将所有年度的表的信息汇总到一个总表,并加上列year
  103. years=range(1880,2011)
  104. piece=[]
  105. for year in years:
  106.    sep_path=path_name+'yob%d.txt' % year
  107.    sep_frame=pd.read_csv(sep_path,names=columns)
  108.    sep_frame['year']=year##为每个表添加字段-年,并赋值为当年
  109.    piece.append(sep_frame)
  110. names=pd.concat(piece,ignore_index=True)##!!!默认按行组合多个
  111. ##按照性别(column,字段名)对每年(index行名)的births(首参数)进行求和(aggfunc)
  112. total_births=names.pivot_table('births',index='year',columns='gender',aggfunc=sum)
  113. total_births.plot(title='Total births by gneder and year')
  114. #groupby_total_births=names.groupby(['year','gender']).births.sum()
  115. #groupby_total_births.plot(title='Total births by gneder and year')


posted on 2017-02-28 22:17  zx0801  阅读(384)  评论(0编辑  收藏  举报