二十、数据可视化
作者:Chris Albon
译者:飞龙
协议:CC BY-NC-SA 4.0
MatPlotLib 中的双向条形图
%matplotlib inline
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
# Test scores for five people: one list per column.
raw_data = dict(
    first_name=['Jason', 'Molly', 'Tina', 'Jake', 'Amy'],
    pre_score=[4, 24, 31, 2, 3],
    mid_score=[25, 94, 57, 62, 70],
    post_score=[5, 43, 23, 23, 51],
)
# Fix the column order explicitly when building the frame.
column_order = ['first_name', 'pre_score', 'mid_score', 'post_score']
df = pd.DataFrame(raw_data, columns=column_order)
df
| | first_name | pre_score | mid_score | post_score |
|---|---|---|---|---|
0 | Jason | 4 | 25 | 5 |
1 | Molly | 24 | 94 | 43 |
2 | Tina | 31 | 57 | 23 |
3 | Jake | 2 | 62 | 23 |
4 | Amy | 3 | 70 | 51 |
# Row 1 (Molly) and row 2 (Tina), every column after first_name.
# .iloc replaces the .ix indexer, which was removed in pandas 1.0.
# A row slice of this mixed-type frame comes back as object dtype,
# so cast to int before negating and plotting.
x1 = df.iloc[1, 1:].astype(int)
x2 = df.iloc[2, 1:].astype(int)

bar_labels = ['Pre Score', 'Mid Score', 'Post Score']

fig = plt.figure(figsize=(8, 6))

# One y position per score column.
y_pos = list(range(len(x1)))
plt.yticks(y_pos, bar_labels, fontsize=10)

# Molly's bars grow to the right of zero ...
plt.barh(y_pos, x1, align='center', alpha=0.4, color='#263F13')
# ... and Tina's are negated so they grow to the left.
plt.barh(y_pos, -x2, align='center', alpha=0.4, color='#77A61D')

plt.xlabel('Tina\'s Score: Light Green. Molly\'s Score: Dark Green')
t = plt.title('Comparison of Molly and Tina\'s Score')

# Pad the axes so no bar touches the frame.
plt.ylim([-1, len(x1) + 0.1])
plt.xlim([-max(x2) - 10, max(x1) + 10])
plt.grid()
plt.show()
MatPlotLib 中的条形图
%matplotlib inline
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
# Column order used for the frame below.
score_table_columns = ['first_name', 'pre_score', 'mid_score', 'post_score']

# Raw test scores, keyed by column name.
raw_data = {
    'first_name': ['Jason', 'Molly', 'Tina', 'Jake', 'Amy'],
    'pre_score': [4, 24, 31, 2, 3],
    'mid_score': [25, 94, 57, 62, 70],
    'post_score': [5, 43, 23, 23, 51],
}

df = pd.DataFrame(raw_data, columns=score_table_columns)
df
| | first_name | pre_score | mid_score | post_score |
|---|---|---|---|---|
0 | Jason | 4 | 25 | 5 |
1 | Molly | 24 | 94 | 43 |
2 | Tina | 31 | 57 | 23 |
3 | Jake | 2 | 62 | 23 |
4 | Amy | 3 | 70 | 51 |
score_columns = ['pre_score', 'mid_score', 'post_score']

# Mean score of each test.
mean_values = [df[column].mean() for column in score_columns]

# Error-bar size for each test: 25% of that test's own mean.
# (Previously all three entries were derived from pre_score.)
variance = [mean * 0.25 for mean in mean_values]

bar_labels = ['Pre Score', 'Mid Score', 'Post Score']

# One x position per bar.
x_pos = list(range(len(bar_labels)))

plt.bar(x_pos,
        mean_values,
        yerr=variance,
        align='center',
        color='#FFC222',
        alpha=0.5)
plt.grid()

# Take the (mean, error) pair with the largest mean and set the y-axis
# ceiling 10% above the top of its error bar.
max_y = max(zip(mean_values, variance))
plt.ylim([0, (max_y[0] + max_y[1]) * 1.1])

plt.ylabel('Score')
plt.xticks(x_pos, bar_labels)
plt.title('Mean Scores For Each Test')
plt.show()
Seaborn 中的调色板
import pandas as pd
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns
# Timestamps of the ten observations (kept as plain strings).
observation_dates = [
    '2014-05-01 18:47:05.069722',
    '2014-05-01 18:47:05.119994',
    '2014-05-02 18:47:05.178768',
    '2014-05-02 18:47:05.230071',
    '2014-05-02 18:47:05.230071',
    '2014-05-02 18:47:05.280592',
    '2014-05-03 18:47:05.332662',
    '2014-05-03 18:47:05.385109',
    '2014-05-04 18:47:05.436523',
    '2014-05-04 18:47:05.486877',
]

# Deaths per observation for each of the seven regiments.
data = {
    'date': observation_dates,
    'deaths_regiment_1': [34, 43, 14, 15, 15, 14, 31, 25, 62, 41],
    'deaths_regiment_2': [52, 66, 78, 15, 15, 5, 25, 25, 86, 1],
    'deaths_regiment_3': [13, 73, 82, 58, 52, 87, 26, 5, 56, 75],
    'deaths_regiment_4': [44, 75, 26, 15, 15, 14, 54, 25, 24, 72],
    'deaths_regiment_5': [25, 24, 25, 15, 57, 68, 21, 27, 62, 5],
    'deaths_regiment_6': [84, 84, 26, 15, 15, 14, 26, 25, 62, 24],
    'deaths_regiment_7': [46, 57, 26, 15, 15, 14, 26, 25, 62, 41],
}

# 'battle_deaths' has no entry in `data`, so that column is created empty.
column_order = ['date', 'battle_deaths'] + [
    'deaths_regiment_%d' % number for number in range(1, 8)
]
df = pd.DataFrame(data, columns=column_order)

# Index the rows by their date string (the 'date' column is kept as well).
df = df.set_index(df['date'])
# Named palettes to preview, in display order.
palette_names = [
    "deep", "muted", "bright", "dark", "colorblind", "Paired",
    "BuGn", "GnBu", "OrRd", "PuBu", "YlGn", "YlGnBu", "YlOrBr", "YlOrRd",
    "BrBG", "PiYG", "PRGn", "PuOr", "RdBu", "RdGy", "RdYlBu", "RdYlGn",
    "Spectral",
]

# Draw a ten-colour swatch strip for each palette.
for palette_name in palette_names:
    swatch = sns.color_palette(palette_name, 10)
    sns.palplot(swatch)

# A hand-picked palette: make it the current default, then preview it.
flatui = ["#9b59b6", "#3498db", "#95a5a6", "#e74c3c", "#34495e", "#2ecc71"]
sns.set_palette(flatui)
current_palette = sns.color_palette()
sns.palplot(current_palette)
# sns.tsplot was deprecated in seaborn 0.9 and is absent from current
# releases; sns.lineplot is its replacement and expects long-form data.
regiment_columns = ['deaths_regiment_%d' % number for number in range(1, 8)]

# tsplot treated every regiment as one unit measured at positions 0..9.
# Rebuild that layout as one row per (observation, regiment) pair.
deaths_long = (
    df[regiment_columns]
    .reset_index(drop=True)
    .rename_axis('observation')
    .reset_index()
    .melt(id_vars='observation', var_name='regiment', value_name='deaths')
)

# Mean across regiments at each observation, with lineplot's default
# confidence band, drawn in one colour.
# NOTE(review): the band width may differ from the old tsplot default.
sns.lineplot(x='observation', y='deaths', data=deaths_long, color="#34495e")
使用 Seaborn 和 pandas 创建时间序列绘图
import pandas as pd
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns
# Per-regiment death counts, one list of ten observations each.
regiment_counts = [
    [34, 43, 14, 15, 15, 14, 31, 25, 62, 41],
    [52, 66, 78, 15, 15, 5, 25, 25, 86, 1],
    [13, 73, 82, 58, 52, 87, 26, 5, 56, 75],
    [44, 75, 26, 15, 15, 14, 54, 25, 24, 72],
    [25, 24, 25, 15, 57, 68, 21, 27, 62, 5],
    [84, 84, 26, 15, 15, 14, 26, 25, 62, 24],
    [46, 57, 26, 15, 15, 14, 26, 25, 62, 41],
]
regiment_columns = ['deaths_regiment_%d' % number for number in range(1, 8)]

# Observation timestamps first, then the seven regiment columns.
data = {'date': ['2014-05-01 18:47:05.069722', '2014-05-01 18:47:05.119994',
                 '2014-05-02 18:47:05.178768', '2014-05-02 18:47:05.230071',
                 '2014-05-02 18:47:05.230071', '2014-05-02 18:47:05.280592',
                 '2014-05-03 18:47:05.332662', '2014-05-03 18:47:05.385109',
                 '2014-05-04 18:47:05.436523', '2014-05-04 18:47:05.486877']}
data.update(zip(regiment_columns, regiment_counts))

# 'battle_deaths' is requested but absent from `data`, so it comes out empty.
df = pd.DataFrame(data, columns=['date', 'battle_deaths'] + regiment_columns)

# Use the date strings as the row index while keeping the 'date' column.
df = df.set_index(df['date'])
# sns.tsplot was deprecated in seaborn 0.9 and is absent from current
# releases; sns.lineplot is its replacement and expects long-form data.
regiment_columns = ['deaths_regiment_%d' % number for number in range(1, 8)]

# tsplot treated every regiment as one unit measured at positions 0..9.
# Rebuild that layout as one row per (observation, regiment) pair.
deaths_long = (
    df[regiment_columns]
    .reset_index(drop=True)
    .rename_axis('observation')
    .reset_index()
    .melt(id_vars='observation', var_name='regiment', value_name='deaths')
)

# Plot 1: mean across regiments with a shaded confidence band.
# NOTE(review): the band width may differ from the old tsplot default.
sns.lineplot(x='observation', y='deaths', data=deaths_long, color="indianred")

# Plot 2: error bars instead of a band. Markers with no connecting line
# stand in for tsplot's interpolate=False.
sns.lineplot(x='observation', y='deaths', data=deaths_long,
             err_style="bars", marker='o', linestyle='')
使用 Seaborn 创建散点图
import pandas as pd
%matplotlib inline
import random
import matplotlib.pyplot as plt
import seaborn as sns
# Five random, non-repeating integers in 1..999 for each axis, plus a
# binary flag and a categorical label per row.
df = pd.DataFrame({
    'x': random.sample(range(1, 1000), 5),
    'y': random.sample(range(1, 1000), 5),
    'z': [1, 0, 0, 1, 0],
    'k': ['male', 'male', 'male', 'female', 'female'],
})
df.head()
| | x | y | z | k |
|---|---|---|---|---|
0 | 466 | 948 | 1 | male |
1 | 832 | 481 | 0 | male |
2 | 978 | 465 | 0 | male |
3 | 510 | 206 | 1 | female |
4 | 848 | 357 | 0 | female |
# Notebook-sized fonts, slightly enlarged, on a ticked white background.
sns.set_context("notebook", font_scale=1.1)
sns.set_style("ticks")

# Scatter of x against y, coloured by z, with the regression line disabled.
# x and y are passed by keyword: current seaborn rejects positional
# data variables.
sns.lmplot(x='x',
           y='y',
           data=df,
           fit_reg=False,
           hue="z",
           scatter_kws={"marker": "D",
                        "s": 100})

# Title and axis labels describe what is actually plotted (the earlier
# text was carried over from unrelated charts).
plt.title('Scatterplot of x and y')
plt.xlabel('x')
plt.ylabel('y')
MatPlotLib 中的分组条形图
%matplotlib inline
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
# Who took the tests, and what they scored on each one.
first_names = ['Jason', 'Molly', 'Tina', 'Jake', 'Amy']
raw_data = {
    'first_name': first_names,
    'pre_score': [4, 24, 31, 2, 3],
    'mid_score': [25, 94, 57, 62, 70],
    'post_score': [5, 43, 23, 23, 51],
}

# Build the frame with an explicit column order.
df = pd.DataFrame(
    raw_data,
    columns=['first_name', 'pre_score', 'mid_score', 'post_score'],
)
df
| | first_name | pre_score | mid_score | post_score |
|---|---|---|---|---|
0 | Jason | 4 | 25 | 5 |
1 | Molly | 24 | 94 | 43 |
2 | Tina | 31 | 57 | 23 |
3 | Jake | 2 | 62 | 23 |
4 | Amy | 3 | 70 | 51 |
# Left-most x position of each person's group of bars, and bar width.
pos = list(range(len(df['pre_score'])))
width = 0.25

fig, ax = plt.subplots(figsize=(10, 5))

# (column, legend label, colour) for each test. Each series is labelled
# with the test it shows rather than with a person's name.
series_styles = [
    ('pre_score', 'Pre Score', '#EE3224'),
    ('mid_score', 'Mid Score', '#F78F1E'),
    ('post_score', 'Post Score', '#FFC222'),
]

# Shift each successive series one bar-width to the right.
for offset, (column, label, color) in enumerate(series_styles):
    plt.bar([p + width * offset for p in pos],
            df[column],
            width,
            alpha=0.5,
            color=color,
            label=label)

ax.set_ylabel('Score')
ax.set_title('Test Subject Scores')

# Bars are centre-aligned at p, p + width and p + 2 * width, so the
# middle of each group is p + width.
ax.set_xticks([p + width for p in pos])
ax.set_xticklabels(df['first_name'])

plt.xlim(min(pos) - width, max(pos) + width * 4)
# Upper limit is the largest per-person total of the three scores, which
# is higher than any single bar.
plt.ylim([0, max(df['pre_score'] + df['mid_score'] + df['post_score'])])

plt.legend(loc='upper left')
plt.grid()
plt.show()
MatPlotLib 中的直方图
%matplotlib inline
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import math
# Widen pandas' display limits so the wide battles table prints in full.
for option_name, limit in (('display.max_row', 1000),
                           ('display.max_columns', 50)):
    pd.set_option(option_name, limit)

# Download the battles dataset and preview the first rows.
battles_csv_url = 'https://www.dropbox.com/s/52cb7kcflr8qm2u/5kings_battles_v1.csv?dl=1'
df = pd.read_csv(battles_csv_url)
df.head()
| | name | year | battle_number | attacker_king | defender_king | attacker_1 | attacker_2 | attacker_3 | attacker_4 | defender_1 | defender_2 | defender_3 | defender_4 | attacker_outcome | battle_type | major_death | major_capture | attacker_size | defender_size | attacker_commander | defender_commander | summer | location | region | note |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | Battle of the Golden Tooth | 298 | 1 | Joffrey/Tommen Baratheon | Robb Stark | Lannister | NaN | NaN | NaN | Tully | NaN | NaN | NaN | win | pitched battle | 1 | 0 | 15000 | 4000 | Jaime Lannister | Clement Piper, Vance | 1 | Golden Tooth | The Westerlands | NaN |
1 | Battle at the Mummer’s Ford | 298 | 2 | Joffrey/Tommen Baratheon | Robb Stark | Lannister | NaN | NaN | NaN | Baratheon | NaN | NaN | NaN | win | ambush | 1 | 0 | NaN | 120 | Gregor Clegane | Beric Dondarrion | 1 | Mummer’s Ford | The Riverlands | NaN |
2 | Battle of Riverrun | 298 | 3 | Joffrey/Tommen Baratheon | Robb Stark | Lannister | NaN | NaN | NaN | Tully | NaN | NaN | NaN | win | pitched battle | 0 | 1 | 15000 | 10000 | Jaime Lannister, Andros Brax | Edmure Tully, Tytos Blackwood | 1 | Riverrun | The Riverlands | NaN |
— | — | — | — | — | — | — | — | — | — | — | — | — | — | — | — | — | — | — | — | — | — | — | — | — | — |
3 | Battle of the Green Fork | 298 | 4 | Robb Stark | Joffrey/Tommen Baratheon | Stark | NaN | NaN | NaN | Lannister | NaN | NaN | NaN | loss | pitched battle | 1 | 1 | 18000 | 20000 | Roose Bolton, Wylis Manderly, Medger Cerwyn, H… | Tywin Lannister, Gregor Clegane, Kevan Lannist… | 1 | Green Fork | The Riverlands | NaN |
4 | Battle of the Whispering Wood | 298 | 5 | Robb Stark | Joffrey/Tommen Baratheon | Stark | Tully | NaN | NaN | Lannister | NaN | NaN | NaN | win | ambush | 1 | 1 | 1875 | 6000 | Robb Stark, Brynden Tully | Jaime Lannister | 1 | Whispering Wood | The Riverlands | NaN |
# Keep only battles with fewer than 90000 attackers. Rows where
# attacker_size is NaN fail the comparison and are dropped too.
smaller_battles = df['attacker_size'] < 90000
data1 = df.loc[smaller_battles, 'attacker_size']
# Defender sizes for the same battles, minus missing values.
data2 = df.loc[smaller_battles, 'defender_size'].dropna()

# Fixed-width bins spanning BOTH series. Running from data1.min() to
# data2.max() dropped values outside that range. The extra bin_width on
# the stop value makes the last edge reach the largest observation.
bin_width = 2000
lowest = min(data1.min(), data2.min())
highest = max(data1.max(), data2.max())
bins = np.arange(lowest, highest + bin_width, bin_width)

plt.hist(data1,
         bins=bins,
         alpha=0.5,
         color='#EDD834',
         label='Attacker')
plt.hist(data2,
         bins=bins,
         alpha=0.5,
         color='#887E43',
         label='Defender')

plt.ylim([0, 10])
plt.title('Histogram of Attacker and Defender Size')
plt.xlabel('Number of troops')
plt.ylabel('Number of battles')
plt.legend(loc='upper right')
plt.show()
# Keep only battles with fewer than 90000 attackers. Rows where
# attacker_size is NaN fail the comparison and are dropped too.
smaller_battles = df['attacker_size'] < 90000
data1 = df.loc[smaller_battles, 'attacker_size']
# Defender sizes for the same battles, minus missing values.
data2 = df.loc[smaller_battles, 'defender_size'].dropna()

# `data1 + data2` on two Series is an element-wise sum, not a
# concatenation, so it gave the range of the per-battle totals.
# Pool the values instead to get the true min and max of both series.
both_sizes = pd.concat([data1, data2])

# Ten evenly spaced edges (nine bins) covering the pooled range.
bins = np.linspace(both_sizes.min(), both_sizes.max(), 10)

plt.hist(data1,
         bins=bins,
         alpha=0.5,
         color='#EDD834',
         label='Attacker')
plt.hist(data2,
         bins=bins,
         alpha=0.5,
         color='#887E43',
         label='Defender')

plt.ylim([0, 10])
plt.title('Histogram of Attacker and Defender Size')
plt.xlabel('Number of troops')
plt.ylabel('Number of battles')
plt.legend(loc='upper right')
plt.show()
从 Pandas 数据帧生成 MatPlotLib 散点图
%matplotlib inline
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
# Personal details and test scores for five people.
raw_data = dict(
    first_name=['Jason', 'Molly', 'Tina', 'Jake', 'Amy'],
    last_name=['Miller', 'Jacobson', 'Ali', 'Milner', 'Cooze'],
    female=[0, 1, 1, 0, 1],
    age=[42, 52, 36, 24, 73],
    preTestScore=[4, 24, 31, 2, 3],
    postTestScore=[25, 94, 57, 62, 70],
)

# The frame's column order differs from the dict's: age precedes female.
column_order = ['first_name', 'last_name', 'age', 'female',
                'preTestScore', 'postTestScore']
df = pd.DataFrame(raw_data, columns=column_order)
df
| | first_name | last_name | age | female | preTestScore | postTestScore |
|---|---|---|---|---|---|---|
0 | Jason | Miller | 42 | 0 | 4 | 25 |
1 | Molly | Jacobson | 52 | 1 | 24 | 94 |
2 | Tina | Ali | 36 | 1 | 31 | 57 |
3 | Jake | Milner | 24 | 0 | 2 | 62 |
4 | Amy | Cooze | 73 | 1 | 3 | 70 |
# Both scatters plot pre-test score against post-test score.
pre_scores = df['preTestScore']
post_scores = df['postTestScore']

# First pass: marker area taken from each person's age.
plt.scatter(pre_scores, post_scores, s=df['age'])

# Second pass: fixed marker size, colour taken from the female flag.
plt.scatter(pre_scores, post_scores, s=300, c=df['female'])
Matplotlib 的简单示例
%matplotlib inline
import matplotlib.pyplot as pyplot
# Draw a line through two y-values; no x-values are supplied.
y_values = [1.6, 2.7]
pyplot.plot(y_values)
MatPlotLib 中的饼图
%matplotlib inline
import pandas as pd
import matplotlib.pyplot as plt
# Monthly arrest counts per officer.
raw_data = dict(
    officer_name=['Jason', 'Molly', 'Tina', 'Jake', 'Amy'],
    jan_arrests=[4, 24, 31, 2, 3],
    feb_arrests=[25, 94, 57, 62, 70],
    march_arrests=[5, 43, 23, 23, 51],
)

# Build the frame with an explicit column order.
column_order = ['officer_name', 'jan_arrests', 'feb_arrests', 'march_arrests']
df = pd.DataFrame(raw_data, columns=column_order)
df
| | officer_name | jan_arrests | feb_arrests | march_arrests |
|---|---|---|---|---|
0 | Jason | 4 | 25 | 5 |
1 | Molly | 24 | 94 | 43 |
2 | Tina | 31 | 57 | 23 |
3 | Jake | 2 | 62 | 23 |
4 | Amy | 3 | 70 | 51 |
# Row-wise total of the three monthly arrest columns.
monthly_columns = ['jan_arrests', 'feb_arrests', 'march_arrests']
df['total_arrests'] = df[monthly_columns].sum(axis=1)
df
| | officer_name | jan_arrests | feb_arrests | march_arrests | total_arrests |
|---|---|---|---|---|---|
0 | Jason | 4 | 25 | 5 | 34 |
1 | Molly | 24 | 94 | 43 | 161 |
2 | Tina | 31 | 57 | 23 | 111 |
3 | Jake | 2 | 62 | 23 | 87 |
4 | Amy | 3 | 70 | 51 | 124 |
# Wedge colours (more are listed than there are wedges).
colors = ["#E13F29", "#D69A80", "#D63B59", "#AE5552", "#CB5C3B", "#EB8076", "#96624E"]

# Pull only the fifth wedge away from the centre.
wedge_offsets = (0, 0, 0, 0, 0.15)

# One wedge per officer, sized by total arrests, labelled with the name
# and a one-decimal percentage.
pie_options = dict(
    labels=df['officer_name'],
    shadow=False,
    colors=colors,
    explode=wedge_offsets,
    startangle=90,
    autopct='%1.1f%%',
)
plt.pie(df['total_arrests'], **pie_options)

# Equal axis scaling keeps the pie circular.
plt.axis('equal')
plt.tight_layout()
plt.show()
MatPlotLib 中的散点图
%matplotlib inline
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
# Raise pandas' display limits so the wide battles table is shown in full.
display_limits = {'display.max_row': 1000, 'display.max_columns': 50}
for option_name, limit in display_limits.items():
    pd.set_option(option_name, limit)

# Read the battles dataset from GitHub and preview the first rows.
battles_csv_url = 'https://raw.githubusercontent.com/chrisalbon/war_of_the_five_kings_dataset/master/5kings_battles_v1.csv'
df = pd.read_csv(battles_csv_url)
df.head()
| | name | year | battle_number | attacker_king | defender_king | attacker_1 | attacker_2 | attacker_3 | attacker_4 | defender_1 | defender_2 | defender_3 | defender_4 | attacker_outcome | battle_type | major_death | major_capture | attacker_size | defender_size | attacker_commander | defender_commander | summer | location | region | note |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | Battle of the Golden Tooth | 298 | 1 | Joffrey/Tommen Baratheon | Robb Stark | Lannister | NaN | NaN | NaN | Tully | NaN | NaN | NaN | win | pitched battle | 1.0 | 0.0 | 15000.0 | 4000.0 | Jaime Lannister | Clement Piper, Vance | 1.0 | Golden Tooth | The Westerlands | NaN |
1 | Battle at the Mummer’s Ford | 298 | 2 | Joffrey/Tommen Baratheon | Robb Stark | Lannister | NaN | NaN | NaN | Baratheon | NaN | NaN | NaN | win | ambush | 1.0 | 0.0 | NaN | 120.0 | Gregor Clegane | Beric Dondarrion | 1.0 | Mummer’s Ford | The Riverlands | NaN |
2 | Battle of Riverrun | 298 | 3 | Joffrey/Tommen Baratheon | Robb Stark | Lannister | NaN | NaN | NaN | Tully | NaN | NaN | NaN | win | pitched battle | 0.0 | 1.0 | 15000.0 | 10000.0 | Jaime Lannister, Andros Brax | Edmure Tully, Tytos Blackwood | 1.0 | Riverrun | The Riverlands | NaN |
3 | Battle of the Green Fork | 298 | 4 | Robb Stark | Joffrey/Tommen Baratheon | Stark | NaN | NaN | NaN | Lannister | NaN | NaN | NaN | loss | pitched battle | 1.0 | 1.0 | 18000.0 | 20000.0 | Roose Bolton, Wylis Manderly, Medger Cerwyn, H… | Tywin Lannister, Gregor Clegane, Kevan Lannist… | 1.0 | Green Fork | The Riverlands | NaN |
4 | Battle of the Whispering Wood | 298 | 5 | Robb Stark | Joffrey/Tommen Baratheon | Stark | Tully | NaN | NaN | Lannister | NaN | NaN | NaN | win | ambush | 1.0 | 1.0 | 1875.0 | 6000.0 | Robb Stark, Brynden Tully | Jaime Lannister | 1.0 | Whispering Wood | The Riverlands | NaN |
plt.figure(figsize=(10, 8))

# (year, marker, colour) for each group of battles.
year_styles = [
    (298, 'x', 'b'),
    (299, 'o', 'r'),
    (300, '^', 'g'),
]

# One scatter series per year: attacker size against defender size.
for year, marker, color in year_styles:
    in_year = df['year'] == year
    plt.scatter(df.loc[in_year, 'attacker_size'],
                df.loc[in_year, 'defender_size'],
                marker=marker,
                color=color,
                alpha=0.7,
                s=124,
                label='Year %d' % year)

plt.title('Battles Of The War Of The Five Kings')
plt.ylabel('Defender Size')
plt.xlabel('Attacker Size')
plt.legend(loc='upper right')

# Pad the axes by 1000 troops on every side. Series.min()/max() skip
# missing values; the builtin min()/max() give order-dependent results
# when the column contains NaN.
attacker_sizes = df['attacker_size']
defender_sizes = df['defender_size']
plt.xlim([attacker_sizes.min() - 1000, attacker_sizes.max() + 1000])
plt.ylim([defender_sizes.min() - 1000, defender_sizes.max() + 1000])
plt.show()
MatPlotLib 中的栈式百分比条形图
%matplotlib inline
import pandas as pd
import matplotlib.pyplot as plt
# Test scores per person; dict keys double as the column names.
raw_data = {
    'first_name': ['Jason', 'Molly', 'Tina', 'Jake', 'Amy'],
    'pre_score': [4, 24, 31, 2, 3],
    'mid_score': [25, 94, 57, 62, 70],
    'post_score': [5, 43, 23, 23, 51],
}

# Name first, then the three scores in test order.
ordered_columns = ['first_name'] + ['%s_score' % stage for stage in ('pre', 'mid', 'post')]
df = pd.DataFrame(raw_data, columns=ordered_columns)
df
| | first_name | pre_score | mid_score | post_score |
|---|---|---|---|---|
0 | Jason | 4 | 25 | 5 |
1 | Molly | 24 | 94 | 43 |
2 | Tina | 31 | 57 | 23 |
3 | Jake | 2 | 62 | 23 |
4 | Amy | 3 | 70 | 51 |
f, ax = plt.subplots(1, figsize=(10, 5))

bar_width = 1

# x position of each person's bar.
bar_l = list(range(len(df['pre_score'])))

# ax.bar centres each bar on its x position by default, so the tick for
# a bar goes at that same position. (An offset of bar_width / 2 would
# put it on the bar's right edge.)
tick_pos = bar_l

# Each person's total across the three tests ...
totals = [i + j + k for i, j, k in zip(df['pre_score'], df['mid_score'], df['post_score'])]

# ... and each test's share of that total, in percent.
pre_rel = [i / j * 100 for i, j in zip(df['pre_score'], totals)]
mid_rel = [i / j * 100 for i, j in zip(df['mid_score'], totals)]
post_rel = [i / j * 100 for i, j in zip(df['post_score'], totals)]

# (label, heights, colour) for each layer, bottom layer first.
layers = [
    ('Pre Score', pre_rel, '#019600'),
    ('Mid Score', mid_rel, '#3C5F5A'),
    ('Post Score', post_rel, '#219AD8'),
]

# Stack the layers: each one starts where the layers below it end.
bottoms = [0] * len(bar_l)
for label, heights, color in layers:
    ax.bar(bar_l,
           heights,
           bottom=bottoms,
           label=label,
           alpha=0.9,
           color=color,
           width=bar_width,
           edgecolor='white')
    bottoms = [base + height for base, height in zip(bottoms, heights)]

plt.xticks(tick_pos, df['first_name'])
ax.set_ylabel("Percentage")
ax.set_xlabel("")

# Leave one bar-width of space on each side and some margin above/below.
plt.xlim([min(tick_pos) - bar_width, max(tick_pos) + bar_width])
plt.ylim(-10, 110)

# Rotate the name labels so they end at their tick.
plt.setp(plt.gca().get_xticklabels(), rotation=45, horizontalalignment='right')

plt.show()