网站更新内容:请访问: https://bigdata.ministep.cn/

Pandas基础API

数据管理

演示数据集

# Create a dataframe
import pandas as pd
import numpy as np
raw_data = {'first_name': ['Jason', 'Molly', np.nan, np.nan, np.nan], 
        'nationality': ['USA', 'USA', 'France', 'UK', 'UK'], 
        'age': [42, 52, 36, 24, 70]}
df = pd.DataFrame(raw_data, columns = ['first_name', 'nationality', 'age'])
df
# Select all cases where the first name is not missing and nationality is USA 
df[df['first_name'].notnull() & (df['nationality'] == "USA")]
first_name nationality age
0 Jason USA 42
1 Molly USA 52

基本操作

唯一值unique

# List unique values in a DataFrame column
df['Column Name'].unique()

类型转换

###  Convert Series datatype to numeric (will error if column has non-numeric values)
pd.to_numeric(df['Column Name'])
### Convert Series datatype to numeric, changing non-numeric values to NaN
将Series数据类型转换为数字,将非数字值更改为NaN
pd.to_numeric(df['Column Name'], errors='coerce')
####  更改数据类型
# Change data type of DataFrame column
df.column_name = df.column_name.astype(np.int64)

变量重编码(重赋值)

# Set DataFrame column values based on other column values (h/t: @mlevkov)
df.loc[(df['column1'] == some_value) & (df['column2'] == some_other_value), ['column_to_change']] = new_value

创建新变量

# Concatenate two DataFrame columns into a new, single column
# (useful when dealing with composite keys, for example)
# (h/t @makmanalp for improving this one!)
df['newcol'] = df['col1'].astype(str) + df['col2'].astype(str)
df.a.astype(float).fillna(0.0)
s = pd.Series(['apple', '1.0', '2', -3])
pd.to_numeric(s, errors='ignore') #忽略
pd.to_numeric(s, errors='coerce') #强制转换成NaN
# Doing calculations with DataFrame columns that have missing values
# In example below, swap in 0 for df['col1'] cells that contain null
df['new_col'] = np.where(pd.isnull(df['col1']),0,df['col1']) + df['col2']

重命名rename

# Rename several DataFrame columns
df = df.rename(columns = {
    'col1 old name':'col1 new name',
    'col2 old name':'col2 new name',
    'col3 old name':'col3 new name',
})

# Lower-case all DataFrame column names
df.columns = map(str.lower, df.columns)

# Even more fancy DataFrame column re-naming
# lower-case all DataFrame column names (for example)
df.rename(columns=lambda x: x.split('.')[-1], inplace=True)

排序

# Sort dataframe by multiple columns
df = df.sort_values(['col1','col2','col3'],ascending=[1,1,0])

行数,列数

## get-the-row-count-of-a-pandas-dataframe
import numpy as np
import pandas as pd
Count_Row=df.shape[0] #gives number of row count
Count_Col=df.shape[1] #gives number of col count
#补充:其他方法
df = pd.DataFrame(np.arange(12).reshape(4,3))
df.shape
#or
df[0].count()
#or
len(df.index)

不用科学记数法

# Set up formatting so larger numbers aren't displayed in scientific notation (h/t @thecapacity)
pd.set_option('display.float_format', lambda x: '%.3f' % x)
# To display with commas and no decimals
pd.options.display.float_format = '{:,.0f}'.format

where rows 值筛选

# Grab DataFrame rows where column = a specific value
df = df.loc[df.column == 'somevalue']


# Grab DataFrame rows where column value is present in a list
valuelist = ['value1', 'value2', 'value3']
df = df[df.column.isin(valuelist)]


# Grab DataFrame rows where column value is not present in a list
valuelist = ['value1', 'value2', 'value3']
df = df[~df.column.isin(value_list)]


# Select from DataFrame using criteria from multiple columns
# (use `|` instead of `&` to do an OR)
newdf = df[(df['column_one']>2004) & (df['column_two']==9)]


select column 选择列

# Delete column from DataFrame
del df['column']
## 选择行
select_columns = ['city','date_housecode','title','flood','priceInfo','totalPrice','unitPrice']
table = df[select_columns]

循环行Loop through rows

# Loop through rows in a DataFrame
# (if you must)
for index, row in df.iterrows():
    print index, row['some column']
    
# Much faster way to loop through DataFrame rows
# if you can work with tuples
# (h/t hughamacmullaniv)
for row in df.itertuples():
    print(row)

聚合groupby

groupby细节

  • count(),sum() 默认不会计算nan值
  • groupby( , as_index=False) 加入as_index=False,可以使groupby的列变成正常的值,不再是索引值
# Get top n for each group of columns in a sorted dataframe
# (make sure dataframe is sorted first)
top5 = df.groupby(['groupingcol1', 'groupingcol2']).head(5)

行数count of rows

# Get quick count of rows in a DataFrame
len(df.index)

count or count distinct

#size跟count的区别: size计数时包含NaN值,而count不包含NaN值
df.groupby(["col1,col2"], as_index=False)['cola','colb'].count()
df.groupby(['col1', 'col2'], as_index=False).size().reset_index(name='counts')
#
import pandas as pd
import numpy as np
df = pd.DataFrame([['a',1],['a',np.nan],['a',3],['b',3]], columns=['id', 'value'])
df
df.groupby(["id"], as_index=False)['value'].count()
df.groupby(["id"], as_index=False)['value'].sum()
df.groupby(["id"], as_index=False)['value'].size()

count disticnt

# count(distinct )
df.groupby('col1', as_index=False).CLIENTCODE.nunique()
#other ways
df.groupby('col1', as_index=False).agg({'col1': pd.Series.nunique})  
#ways
df.groupby('col1', as_index=False)['col2'].nunique()
#参考资料:https://stackoverflow.com/questions/18554920/pandas-aggregate-count-distinct

不同的列应用多种不同的统计方法

df.groupby('col1').agg({'col2':[np.mean, 'sum'], 'col3':['count','count']})

异常值处理

null/notnull 处理

# Grab DataFrame rows where specific column is null/notnull
newdf = df[df['column'].isnull()]

# Change all NaNs to None (useful before
# loading to a db)
df = df.where((pd.notnull(df)), None)

替换 replace()及部分替换

## replace()是很好的方法
#1.基本结构:#
df.replace(to_replace, value) 前面是需要替换的值,后面是替换后的值。
df.replace('A',0.1,inplace=True)
##如果需要改变原数据,需要添加常用参数 inplace=True
df['金额'].replace(Na,0,inplace=True)
#3. 用字典形式替换多个值。
df.replace({'C':0,'F':0})
#4. 使用正则表达式替换多个
df.replace('[A-Z]',0,regex=True)
#5.某个数据的部分内容
df['名称'].str.replace('产品','Product')
# Get rid of non-numeric values throughout a DataFrame:
for col in refunds.columns.values:
  refunds[col] = refunds[col].replace('[^0-9]+.-', '', regex=True)
### dropping-infinite-values-from-dataframes-in-pandas

##参考资料 https://stackoverflow.com/questions/17477979/dropping-infinite-values-from-dataframes-in-pandas/35783766

异常值填充

# Clean up missing values in multiple DataFrame columns
df = df.fillna({
    'col1': 'missing',
    'col2': '99.999',
    'col3': '999',
    'col4': 'missing',
    'col5': 'missing',
    'col6': '99'
})

applymap 清理

# More pre-db insert cleanup...make a pass through the dataframe, stripping whitespace
# from strings and changing any empty values to None
# (not especially recommended but including here b/c I had to do this in real life one time)
df = df.applymap(lambda x: str(x).strip() if len(str(x).strip()) else None)

Reshaping in Pandas

Pivot, Pivot-Table, Stack and Unstack explained with Pictures

参考资料:Pandas中的数据重塑(reshape)功能 http://datartisan.com/article/detail/104.html

from collections import OrderedDict
from pandas import DataFrame
import pandas as pd
import numpy as np

table = OrderDict((
    ("Item",['Item0','Item0','Item1','Item1']),
    ("CType",['Gold','Bronze','Gold','Silver']),
    ("USD",['1$','2$','3$','4$']),
    ("EU",['1€','2€','3€','4€'])
))
d = DataFrame(table)

#pivot,pivot_table的区别是,columns的值不能有重复值,有重复值,则需要使用pivot_table

Stack/Unstack

#轴向旋转(pivot)运算是堆叠(stack)过程的特例。
#首先假设原始数据集中的行列索引中均为层次索引。
#stack 过程表示将数据集的列旋转为行,同理 unstack 过程表示将数据的行旋转为列。
#因此,我们可以发现 stack 使得数据集变得更长,unstack 使得数据集变得更宽。
# Pivot data (with flexibility about what what
# becomes a column and what stays a row).
# Syntax works on Pandas >= .14
pd.pivot_table(
  df,values='cell_value',
  index=['col1', 'col2', 'col3'], #these stay as columns; will fail silently if any of these cols have null values
  columns=['col4']) #data values in this column become their own column

apply

apply lambda if 使用

df['盘存']=df['盗损/报损数量'].apply(lambda x:'盘亏' if x >= 0 else '盘盈')

apply 除法运算

df_day['动销率'] = df_day['动销点位数'].apply(lambda x:float(x))/df_day['运营点位数'].apply(lambda x:float(x))

日期

常见日期

from datetime import date, timedelta,datetime
today = date.today()
yesterday = date.today() - timedelta(1)
yesterday = yesterday.strftime('%Y-%m-%d')

日期 <==>字符串

# 日期 to 字符串
##方法一
datetime.now().strftime('%Y-%m-%d')
##数据框
pd.to_datetime(df['date'])

# 字符串 to 日期
str_date ='2018-12-12'
datetime.strptime(str_date,'%Y-%m-%d')
# 数据框
df['date'].apply(lambda x: x.strftime('%Y-%m-%d'))
datetime.datetime(2018, 12, 12, 0, 0)

IO数据集导入或者导出

字典数据导入

# Create a DataFrame from a Python dictionary
df = pd.DataFrame(list(a_dictionary.items()), columns = ['column1', 'column2'])

posted @ 2021-04-04 20:26  ministep88  阅读(162)  评论(1编辑  收藏  举报
网站更新内容:请访问:https://bigdata.ministep.cn/