#1
import pandas as pd
#2
pd.__version__
'1.0.5'
#3
pd.show_versions()
INSTALLED VERSIONS
------------------
commit : None
python : 3.8.3.final.0
python-bits : 64
OS : Darwin
OS-release : 19.6.0
machine : x86_64
processor : i386
byteorder : little
LC_ALL : None
LANG : zh_CN.UTF-8
LOCALE : zh_CN.UTF-8
pandas : 1.0.5
numpy : 1.18.5
pytz : 2020.1
dateutil : 2.8.1
pip : 20.1.1
setuptools : 49.2.0.post20200714
Cython : 0.29.21
pytest : 5.4.3
hypothesis : None
sphinx : 3.1.2
blosc : None
feather : None
xlsxwriter : 1.2.9
lxml.etree : 4.5.2
html5lib : 1.1
pymysql : 0.10.1
psycopg2 : None
jinja2 : 2.11.2
IPython : 7.16.1
pandas_datareader: None
bs4 : 4.9.1
bottleneck : 1.3.2
fastparquet : None
gcsfs : None
lxml.etree : 4.5.2
matplotlib : 3.2.2
numexpr : 2.7.1
odfpy : None
openpyxl : 3.0.4
pandas_gbq : None
pyarrow : None
pytables : None
pytest : 5.4.3
pyxlsb : None
s3fs : None
scipy : 1.5.0
sqlalchemy : 1.3.18
tables : 3.6.1
tabulate : None
xarray : None
xlrd : 1.2.0
xlwt : 1.3.0
xlsxwriter : 1.2.9
numba : 0.50.1
#4创建一个DataFrame(df),用data做数据,labels做行索引
import numpy as np
data = {'animal': ['cat', 'cat', 'snake', 'dog', 'dog', 'cat', 'snake', 'cat', 'dog', 'dog'],
'age': [2.5, 3, 0.5, np.nan, 5, 2, 4.5, np.nan, 7, 3],
'visits': [1, 3, 2, 3, 2, 3, 1, 1, 2, 1],
'priority': ['yes', 'yes', 'no', 'yes', 'no', 'no', 'no', 'yes', 'no', 'no']}
labels = ['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j']
df = pd.DataFrame(data,index=labels)
df
|
animal |
age |
visits |
priority |
a |
cat |
2.5 |
1 |
yes |
b |
cat |
3.0 |
3 |
yes |
c |
snake |
0.5 |
2 |
no |
d |
dog |
NaN |
3 |
yes |
e |
dog |
5.0 |
2 |
no |
f |
cat |
2.0 |
3 |
no |
g |
snake |
4.5 |
1 |
no |
h |
cat |
NaN |
1 |
yes |
i |
dog |
7.0 |
2 |
no |
j |
dog |
3.0 |
1 |
no |
#5显示有关此df及其数据的基本信息的摘要
df.info()
df.describe()
<class 'pandas.core.frame.DataFrame'>
Index: 10 entries, a to j
Data columns (total 4 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 animal 10 non-null object
1 age 8 non-null float64
2 visits 10 non-null int64
3 priority 10 non-null object
dtypes: float64(1), int64(1), object(2)
memory usage: 400.0+ bytes
|
age |
visits |
count |
8.000000 |
10.000000 |
mean |
3.437500 |
1.900000 |
std |
2.007797 |
0.875595 |
min |
0.500000 |
1.000000 |
25% |
2.375000 |
1.000000 |
50% |
3.000000 |
2.000000 |
75% |
4.625000 |
2.750000 |
max |
7.000000 |
3.000000 |
#6查看此df的前三行数据
df.head(3)
|
animal |
age |
visits |
priority |
a |
cat |
2.5 |
1 |
yes |
b |
cat |
3.0 |
3 |
yes |
c |
snake |
0.5 |
2 |
no |
#7选择df中列标签为animal和age的数据
df1 = df[['animal','age']]
df1
|
animal |
age |
a |
cat |
2.5 |
b |
cat |
3.0 |
c |
snake |
0.5 |
d |
dog |
NaN |
e |
dog |
5.0 |
f |
cat |
2.0 |
g |
snake |
4.5 |
h |
cat |
NaN |
i |
dog |
7.0 |
j |
dog |
3.0 |
#8选择行为[3, 4, 8],且列为['animal', 'age']中的数据
#本题的难点在于当行索引被命名为非数字时不能再使用loc按照数字取值,如果使用iloc就不能用名称取列,而本题的要求是按数字取行,按名称取列
#解法一:分两次取值
df2 = df.iloc[[3,4,8],:][['animal', 'age']]
#解法二:取行索引然后按索引取
df2 = df.loc[df.index[[3,4,8]],['animal', 'age']]
#错误用法:
# df2 = df.loc[[3,4,8],['animal', 'age']]
df2
|
animal |
age |
d |
dog |
NaN |
e |
dog |
5.0 |
i |
dog |
7.0 |
#9选择visuts大于2的动物种类
df
bool_s=df.visits>2
df[bool_s]['animal']
b cat
d dog
f cat
Name: animal, dtype: object
#10选择age为缺失值的行
df[df.age.isnull()]
df[df['age'].isnull()]
|
animal |
age |
visits |
priority |
d |
dog |
NaN |
3 |
yes |
h |
cat |
NaN |
1 |
yes |
#11选择animal为cat,且age小于3的行
df[(df['animal']=='cat') & (df['age']<3)]
|
animal |
age |
visits |
priority |
a |
cat |
2.5 |
1 |
yes |
f |
cat |
2.0 |
3 |
no |
#12选择age在2到4之间的数据(包含边界值)
df[(df['age']>=2) &(df['age']<=4)]
|
animal |
age |
visits |
priority |
a |
cat |
2.5 |
1 |
yes |
b |
cat |
3.0 |
3 |
yes |
f |
cat |
2.0 |
3 |
no |
j |
dog |
3.0 |
1 |
no |
#13将f行的age改为1.5
# df.loc['f']['age']=1.5 这个操作不规范会警报
df.loc['f','age']=1.5
#14计算visits列的数据总和
res = df['visits'].sum()
res
19
#15计算每种animal的平均age
df.groupby('animal')['age'].mean()
animal
cat 2.333333
dog 5.000000
snake 2.500000
Name: age, dtype: float64
#16追加一行(k),列的数据自定义,然后再删除新追加的k行
df.loc['k']=df.loc['a'].values
df.drop('k',inplace=True) #删除行
# del df['k'] #删除列
df
|
animal |
age |
visits |
priority |
a |
cat |
2.5 |
1 |
yes |
b |
cat |
3.0 |
3 |
yes |
c |
snake |
0.5 |
2 |
no |
d |
dog |
NaN |
3 |
yes |
e |
dog |
5.0 |
2 |
no |
f |
cat |
2.0 |
3 |
no |
g |
snake |
4.5 |
1 |
no |
h |
cat |
NaN |
1 |
yes |
i |
dog |
7.0 |
2 |
no |
j |
dog |
3.0 |
1 |
no |
#17计算每种animal的个数(cat有几个,dog几个...)
df.groupby('animal').size()
df['animal'].value_counts()
# df['animal'].unique() #查看动物的种类
# df['animal'].nunique() #查看动物种类的个数
dog 4
cat 4
snake 2
Name: animal, dtype: int64
#18先根据age降序排列,再根据visits升序排列
df.sort_values(by=['age','visits'],ascending=[False,True])
|
animal |
age |
visits |
priority |
i |
dog |
7.0 |
2 |
no |
e |
dog |
5.0 |
2 |
no |
g |
snake |
4.5 |
1 |
no |
j |
dog |
3.0 |
1 |
no |
b |
cat |
3.0 |
3 |
yes |
a |
cat |
2.5 |
1 |
yes |
f |
cat |
2.0 |
3 |
no |
c |
snake |
0.5 |
2 |
no |
h |
cat |
NaN |
1 |
yes |
d |
dog |
NaN |
3 |
yes |
#19将priority列的yes和no用True和False替换
#这题似乎有点问题,不同类型的数据类型似乎不能相互转换
df['priority'] = df['priority'].astype(bool).map({'yes': True, 'no': False})
df
|
animal |
age |
visits |
priority |
a |
cat |
2.5 |
1 |
NaN |
b |
cat |
3.0 |
3 |
NaN |
c |
snake |
0.5 |
2 |
NaN |
d |
dog |
NaN |
3 |
NaN |
e |
dog |
5.0 |
2 |
NaN |
f |
cat |
2.0 |
3 |
NaN |
g |
snake |
4.5 |
1 |
NaN |
h |
cat |
NaN |
1 |
NaN |
i |
dog |
7.0 |
2 |
NaN |
j |
dog |
3.0 |
1 |
NaN |
#20 将animal列的snake用python替换
df['animal'].replace('snake','python',inplace=True)
df
|
animal |
age |
visits |
priority |
a |
cat |
2.5 |
1 |
NaN |
b |
cat |
3.0 |
3 |
NaN |
c |
python |
0.5 |
2 |
NaN |
d |
dog |
NaN |
3 |
NaN |
e |
dog |
5.0 |
2 |
NaN |
f |
cat |
2.0 |
3 |
NaN |
g |
python |
4.5 |
1 |
NaN |
h |
cat |
NaN |
1 |
NaN |
i |
dog |
7.0 |
2 |
NaN |
j |
dog |
3.0 |
1 |
NaN |
#21对于每种动物类型和每种访问次数,求出平均年龄。换句话说,每一行都是动物,每一列都是访问次数,其值是平均年龄(提示:使用数据透视表)
# df.groupby(['animal','visits'])['age'].mean().reset_index()
df.pivot_table(index='animal', columns='visits', values='age', aggfunc='mean')
visits |
1 |
2 |
3 |
animal |
|
|
|
cat |
2.5 |
NaN |
2.5 |
dog |
3.0 |
6.0 |
NaN |
python |
4.5 |
0.5 |
NaN |