Pandas基础API
数据管理
演示数据集
# Create a dataframe
# Build a small demo DataFrame; three of the five first names are missing (NaN).
import pandas as pd
import numpy as np

raw_data = {
    'first_name': ['Jason', 'Molly', np.nan, np.nan, np.nan],
    'nationality': ['USA', 'USA', 'France', 'UK', 'UK'],
    'age': [42, 52, 36, 24, 70],
}
column_order = ['first_name', 'nationality', 'age']
df = pd.DataFrame(raw_data, columns=column_order)
df
# Select all cases where the first name is not missing and nationality is USA
df[(df['nationality'] == "USA") & df['first_name'].notna()]
|   | first_name | nationality | age |
|---|------------|-------------|-----|
| 0 | Jason      | USA         | 42  |
| 1 | Molly      | USA         | 52  |
基本操作
唯一值unique
# List unique values in a DataFrame column
df['Column Name'].unique()
类型转换
### Convert Series datatype to numeric (will error if column has non-numeric values)
pd.to_numeric(df['Column Name'])
### Convert Series datatype to numeric, changing non-numeric values to NaN
将Series数据类型转换为数字,将非数字值更改为NaN
pd.to_numeric(df['Column Name'], errors='coerce')
#### 更改数据类型
# Change data type of DataFrame column
df.column_name = df.column_name.astype(np.int64)
变量重编码(重赋值)
# Set DataFrame column values based on other column values (h/t: @mlevkov)
df.loc[(df['column1'] == some_value) & (df['column2'] == some_other_value), ['column_to_change']] = new_value
创建新变量
# Concatenate two DataFrame columns into a new, single column
# (useful when dealing with composite keys, for example)
# (h/t @makmanalp for improving this one!)
df['newcol'] = df['col1'].astype(str) + df['col2'].astype(str)
df.a.astype(float).fillna(0.0)
s = pd.Series(['apple', '1.0', '2', -3])
pd.to_numeric(s, errors='ignore') #'ignore': return the input unchanged if any value cannot be parsed (deprecated since pandas 2.2)
pd.to_numeric(s, errors='coerce') #'coerce': unparseable values become NaN
# Doing calculations with DataFrame columns that have missing values
# In example below, swap in 0 for df['col1'] cells that contain null
df['new_col'] = np.where(pd.isnull(df['col1']),0,df['col1']) + df['col2']
重命名rename
# Rename several DataFrame columns
df = df.rename(columns = {
'col1 old name':'col1 new name',
'col2 old name':'col2 new name',
'col3 old name':'col3 new name',
})
# Lower-case all DataFrame column names
df.columns = map(str.lower, df.columns)
# Even more fancy DataFrame column re-naming
# lower-case all DataFrame column names (for example)
df.rename(columns=lambda x: x.split('.')[-1], inplace=True)
排序
# Sort dataframe by multiple columns
# (use booleans, not 0/1 ints, for the per-column sort direction)
df = df.sort_values(['col1', 'col2', 'col3'], ascending=[True, True, False])
行数,列数
## get-the-row-count-of-a-pandas-dataframe
import numpy as np
import pandas as pd
Count_Row=df.shape[0] #gives number of row count
Count_Col=df.shape[1] #gives number of col count
#补充:其他方法
df = pd.DataFrame(np.arange(12).reshape(4,3))
df.shape
#or
df[0].count()
#or
len(df.index)
不用科学记数法
# Set up formatting so larger numbers aren't displayed in scientific notation (h/t @thecapacity)
pd.set_option('display.float_format', lambda x: '%.3f' % x)
# To display with commas and no decimals
pd.options.display.float_format = '{:,.0f}'.format
where rows 值筛选
# Grab DataFrame rows where column = a specific value
df = df.loc[df.column == 'somevalue']
# Grab DataFrame rows where column value is present in a list
valuelist = ['value1', 'value2', 'value3']
df = df[df.column.isin(valuelist)]
# Grab DataFrame rows where column value is not present in a list
# (bug fix: the original filtered on `value_list`, which was never defined;
# the list defined above is named `valuelist`)
valuelist = ['value1', 'value2', 'value3']
df = df[~df.column.isin(valuelist)]
# Select from DataFrame using criteria from multiple columns
# (use `|` instead of `&` to do an OR)
newdf = df[(df['column_one']>2004) & (df['column_two']==9)]
select column 选择列
# Delete column from DataFrame
del df['column']
## 选择行
select_columns = ['city','date_housecode','title','flood','priceInfo','totalPrice','unitPrice']
table = df[select_columns]
循环行Loop through rows
# Loop through rows in a DataFrame
# (if you must -- iterrows() is slow and yields a copy of each row)
for index, row in df.iterrows():
    # bug fix: the original used the Python 2 print statement
    print(index, row['some column'])
# Much faster way to loop through DataFrame rows
# if you can work with tuples
# (h/t hughamacmullaniv)
for row in df.itertuples():
    print(row)
聚合groupby
groupby细节
- count(),sum() 默认不会计算nan值
- groupby( , as_index=False) 加入as_index=False,可以使groupby的列变成正常的值,不再是索引值
# Get top n for each group of columns in a sorted dataframe
# (make sure dataframe is sorted first)
top5 = df.groupby(['groupingcol1', 'groupingcol2']).head(5)
行数count of rows
# Get quick count of rows in a DataFrame
len(df.index)
count or count distinct
#size跟count的区别: size计数时包含NaN值,而count不包含NaN值
# (bug fixes: the group keys were passed as the single string "col1,col2";
# selecting two columns from a groupby needs a list, not a bare tuple)
df.groupby(["col1", "col2"], as_index=False)[['cola', 'colb']].count()
df.groupby(['col1', 'col2'], as_index=False).size().reset_index(name='counts')
#
# Demo: count() and sum() skip NaN, size() includes it.
import pandas as pd
import numpy as np

df = pd.DataFrame({'id': ['a', 'a', 'a', 'b'],
                   'value': [1, np.nan, 3, 3]})
df
df.groupby(["id"], as_index=False)['value'].count()
df.groupby(["id"], as_index=False)['value'].sum()
df.groupby(["id"], as_index=False)['value'].size()
count distinct
# count(distinct col2) per group of col1
# (consistency fix: the original mixed in `CLIENTCODE`, a leftover column name
# from the Stack Overflow answer, and aggregated the grouping key itself)
df.groupby('col1', as_index=False)['col2'].nunique()
#other ways
df.groupby('col1', as_index=False).agg({'col2': pd.Series.nunique})
#ways
df.groupby('col1', as_index=False).col2.nunique()
#参考资料:https://stackoverflow.com/questions/18554920/pandas-aggregate-count-distinct
不同的列应用多种不同的统计方法
df.groupby('col1').agg({'col2':[np.mean, 'sum'], 'col3':['count','count']})
异常值处理
null/notnull 处理
# Grab DataFrame rows where specific column is null/notnull
newdf = df[df['column'].isnull()]
# Change all NaNs to None (useful before
# loading to a db)
df = df.where((pd.notnull(df)), None)
替换 replace()及部分替换
## replace()是很好的方法
#1.基本结构:#
df.replace(to_replace, value)  # to_replace 是需要替换的值, value 是替换后的值
# 2. Replace a single value; pass inplace=True to mutate the original DataFrame.
df.replace('A', 0.1, inplace=True)
# Replace NaN with 0 in a single column
# (bug fix: the original passed the undefined name `Na`; use np.nan)
df['金额'].replace(np.nan, 0, inplace=True)
# 3. Replace several values at once with a dict.
df.replace({'C': 0, 'F': 0})
# 4. Replace multiple values by regular expression.
df.replace('[A-Z]', 0, regex=True)
# 5. Replace part of the content of every value in one column.
df['名称'].str.replace('产品', 'Product')
# Get rid of non-numeric values throughout a DataFrame:
# (fix: the loop body had lost its indentation)
for col in refunds.columns.values:
    refunds[col] = refunds[col].replace('[^0-9]+.-', '', regex=True)
### dropping-infinite-values-from-dataframes-in-pandas
##参考资料 https://stackoverflow.com/questions/17477979/dropping-infinite-values-from-dataframes-in-pandas/35783766
异常值填充
# Clean up missing values in multiple DataFrame columns
df = df.fillna({
'col1': 'missing',
'col2': '99.999',
'col3': '999',
'col4': 'missing',
'col5': 'missing',
'col6': '99'
})
applymap 清理
# More pre-db insert cleanup...make a pass through the dataframe, stripping whitespace
# from strings and changing any empty values to None
# (not especially recommended but including here b/c I had to do this in real life one time)
# NOTE(review): DataFrame.applymap is deprecated since pandas 2.1 (renamed to DataFrame.map)
# NOTE(review): str(x) stringifies every cell, so NaN survives as the string 'nan' -- confirm intended
df = df.applymap(lambda x: str(x).strip() if len(str(x).strip()) else None)
Reshaping in Pandas
Pivot, Pivot-Table, Stack and Unstack explained with Pictures
参考资料:Pandas中的数据重塑(reshape)功能 http://datartisan.com/article/detail/104.html
from collections import OrderedDict
from pandas import DataFrame
import pandas as pd
import numpy as np

# Demo table for the reshaping (pivot / stack / unstack) examples.
# (bug fix: the original called the misspelled name `OrderDict`)
table = OrderedDict((
    ("Item", ['Item0', 'Item0', 'Item1', 'Item1']),
    ("CType", ['Gold', 'Bronze', 'Gold', 'Silver']),
    ("USD", ['1$', '2$', '3$', '4$']),
    ("EU", ['1€', '2€', '3€', '4€'])
))
d = DataFrame(table)
#pivot,pivot_table的区别是,columns的值不能有重复值,有重复值,则需要使用pivot_table
Stack/Unstack
#轴向旋转(pivot)运算是堆叠(stack)过程的特例。
#首先假设原始数据集中的行列索引中均为层次索引。
#stack 过程表示将数据集的列旋转为行,同理 unstack 过程表示将数据的行旋转为列。
#因此,我们可以发现 stack 使得数据集变得更长,unstack 使得数据集变得更宽。
# Pivot data (with flexibility about what
# becomes a column and what stays a row).
# Syntax works on Pandas >= .14
pd.pivot_table(
df,values='cell_value',
index=['col1', 'col2', 'col3'], #these become the row index; will fail silently if any of these cols have null values
columns=['col4']) #data values in this column become their own column
apply
apply lambda if 使用
df['盘存']=df['盗损/报损数量'].apply(lambda x:'盘亏' if x >= 0 else '盘盈')
apply 除法运算
df_day['动销率'] = df_day['动销点位数'].apply(lambda x:float(x))/df_day['运营点位数'].apply(lambda x:float(x))
日期
常见日期
from datetime import date, timedelta,datetime
today = date.today()
yesterday = date.today() - timedelta(1)
yesterday = yesterday.strftime('%Y-%m-%d')
日期 <==>字符串
# 日期 to 字符串
##方法一
datetime.now().strftime('%Y-%m-%d')
##数据框
pd.to_datetime(df['date'])
# 字符串 to 日期
str_date ='2018-12-12'
datetime.strptime(str_date,'%Y-%m-%d')
# 数据框
df['date'].apply(lambda x: x.strftime('%Y-%m-%d'))
datetime.datetime(2018, 12, 12, 0, 0)
IO数据集导入或者导出
字典数据导入
# Create a DataFrame from a Python dictionary
df = pd.DataFrame(list(a_dictionary.items()), columns = ['column1', 'column2'])