election人口竞选分析(pandas)
美国2012年总统候选人政治献金数据分析
导入包
import numpy as np
import pandas as pd
from pandas import Series,DataFrame
方便大家操作,将月份和参选人以及所在政党进行定义
# Three-letter upper-case month abbreviation -> month number (1-12).
months = {
    abbreviation: number
    for number, abbreviation in enumerate(
        ('JAN', 'FEB', 'MAR', 'APR', 'MAY', 'JUN',
         'JUL', 'AUG', 'SEP', 'OCT', 'NOV', 'DEC'),
        start=1,
    )
}

# Candidate name -> political party label.
parties = dict([
    ('Bachmann, Michelle', 'Republican'),
    ('Romney, Mitt', 'Republican'),
    ('Obama, Barack', 'Democrat'),
    ("Roemer, Charles E. 'Buddy' III", 'Reform'),
    ('Pawlenty, Timothy', 'Republican'),
    ('Johnson, Gary Earl', 'Libertarian'),
    ('Paul, Ron', 'Republican'),
    ('Santorum, Rick', 'Republican'),
    ('Cain, Herman', 'Republican'),
    ('Gingrich, Newt', 'Republican'),
    ('McCotter, Thaddeus G', 'Republican'),
    ('Huntsman, Jon', 'Republican'),
    ('Perry, Rick', 'Republican'),
])
读取文件
# Load the contribution records. low_memory=False makes pandas infer each
# column's dtype from the whole file instead of chunk by chunk, which avoids
# the "Columns (6) have mixed types" DtypeWarning this file otherwise triggers.
data = pd.read_csv('./data/usa_election.txt', low_memory=False)
data.head()
C:\ProgramData\Anaconda3\lib\site-packages\IPython\core\interactiveshell.py:2728: DtypeWarning: Columns (6) have mixed types. Specify dtype option on import or set low_memory=False.
interactivity=interactivity, compiler=compiler, result=result)
# Add a 'party' column by looking up each candidate name ('cand_nm')
# in the parties mapping.
data['party'] = data.loc[:, 'cand_nm'].map(parties)
data.head(5)
# List the distinct values that occur in the 'party' column.
pd.unique(data['party'])
array(['Republican', 'Democrat', 'Reform', 'Libertarian'], dtype=object)
# Count how often each party appears. Series.value_counts() needs no
# arguments and returns a Series of occurrence counts per distinct value.
data.loc[:, 'party'].value_counts()
Democrat 292400
Republican 237575
Reform 5364
Libertarian 702
Name: party, dtype: int64
# Total contribution amount ('contb_receipt_amt') received by each party.
# The groupby `axis` keyword is deprecated in recent pandas and 0 is its
# default, so it is omitted here.
data.groupby(by='party')['contb_receipt_amt'].sum()
party
Democrat 8.105758e+07
Libertarian 4.132769e+05
Reform 3.390338e+05
Republican 1.192255e+08
Name: contb_receipt_amt, dtype: float64
# Total contribution amount ('contb_receipt_amt') received by each party on
# each receipt date. The groupby `axis` keyword is deprecated in recent
# pandas and 0 is its default, so it is omitted here.
data.groupby(by=['party', 'contb_receipt_dt'])['contb_receipt_amt'].sum()
party contb_receipt_dt
Democrat 01-AUG-11 175281.00
01-DEC-11 651532.82
01-JAN-12 58098.80
01-JUL-11 165961.00
01-JUN-11 145459.00
01-MAY-11 82644.00
01-NOV-11 122529.87
01-OCT-11 148977.00
01-SEP-11 403297.62
02-AUG-11 164510.11
02-DEC-11 216056.96
02-JAN-12 89743.60
02-JUL-11 17105.00
02-JUN-11 422453.00
02-MAY-11 396675.00
02-NOV-11 147183.81
02-OCT-11 62605.62
02-SEP-11 137948.41
03-AUG-11 147053.02
03-DEC-11 81304.02
03-JAN-12 87406.97
03-JUL-11 5982.00
03-JUN-11 320176.20
03-MAY-11 261819.11
03-NOV-11 119304.56
03-OCT-11 363061.02
03-SEP-11 45598.00
04-APR-11 640235.12
04-AUG-11 598784.23
04-DEC-11 72795.10
...
Republican 29-AUG-11 941769.23
29-DEC-11 428501.42
29-JAN-11 750.00
29-JAN-12 75220.02
29-JUL-11 233423.35
29-JUN-11 1340704.29
29-MAR-11 38875.00
29-MAY-11 8363.20
29-NOV-11 407322.64
29-OCT-11 81924.01
29-SEP-11 1612794.52
30-APR-11 43004.80
30-AUG-11 915548.58
30-DEC-11 492470.45
30-JAN-12 255204.80
30-JUL-11 12249.04
30-JUN-11 2744932.63
30-MAR-11 50240.00
30-MAY-11 17803.60
30-NOV-11 809014.83
30-OCT-11 43913.16
30-SEP-11 4886331.76
31-AUG-11 1017735.02
31-DEC-11 1094376.72
31-JAN-11 6000.00
31-JAN-12 869890.41
31-JUL-11 12781.02
31-MAR-11 62475.00
31-MAY-11 301339.80
31-OCT-11 734601.83
Name: contb_receipt_amt, Length: 1183, dtype: float64
def transform_date(d):
    """Convert a 'DD-MON-YY' date string into 'YYYY-MM-DD' form.

    Args:
        d: Date string made of day, month abbreviation and two-digit year
            separated by '-', e.g. '01-AUG-11'. The month abbreviation must
            be a key of the module-level ``months`` mapping.

    Returns:
        The date as a string such as '2011-08-01'. The century is always
        taken to be '20'.

    Raises:
        ValueError: If ``d`` does not split into exactly three parts.
        KeyError: If the month abbreviation is not in ``months``.
    """
    day, month, year = d.split('-')
    # Zero-pad month and day so the result really is 'YYYY-MM-DD' and the
    # strings sort chronologically (previously August came out as '2011-8-01').
    return '20%s-%02d-%s' % (year, months[month], day.zfill(2))
# Rewrite every receipt date with transform_date (intended target format:
# 'yyyy-mm-dd') and store the converted values back into the same column.
date = data['contb_receipt_dt'].apply(transform_date)
data['contb_receipt_dt'] = date
data.head(5)
# Which candidate do disabled veterans (contributor occupation
# 'DISABLED VETERAN') support most, i.e. who received the most money from them?
# Start by pulling the rows with that occupation out of the full data set.
data['contbr_occupation'].eq('DISABLED VETERAN')
old_bing_df = data[data['contbr_occupation'].eq('DISABLED VETERAN')]
old_bing_df.head(5)
# Group the veterans' contributions by candidate and total the amounts.
# The groupby `axis` keyword is deprecated in recent pandas and 0 is its
# default, so it is omitted here.
old_bing_df.groupby(by='cand_nm')['contb_receipt_amt'].sum()
cand_nm
Cain, Herman 300.00
Obama, Barack 4205.00
Paul, Ron 2425.49
Santorum, Rick 250.00
Name: contb_receipt_amt, dtype: float64
# Largest single contribution amount in the data set.
max_contribution = data['contb_receipt_amt'].max()
max_contribution
# Find the row(s) with that largest amount to see the contributor's
# occupation and the amount given. The value is passed to query() through an
# @-reference rather than being formatted into the expression with '%f',
# which would round it to six decimals before the equality comparison.
data.query('contb_receipt_amt == @max_contribution')