# 离散化和面元划分 (Discretization and binning with pandas cut / qcut)

from pandas import DataFrame,Series
import pandas as pd
import numpy as np

# Demo script: discretize a list of ages into bins with pd.cut / pd.qcut.
# The "Recorded output" comments below were captured by the original author;
# the exact repr formatting may differ between pandas versions.

ages = [20, 22, 25, 27, 21, 23, 37, 31, 61, 45, 41, 32]
bins = [18, 25, 35, 60, 100]

# Assign every age to one of the explicit intervals delimited by `bins`.
cats = pd.cut(ages, bins)

print(cats)
# Recorded output:
# [(18, 25], (18, 25], (18, 25], (25, 35], (18, 25], ..., (25, 35], (60, 100], (35, 60], (35, 60], (25, 35]]
# Length: 12
# Categories (4, interval[int64]): [(18, 25] < (25, 35] < (35, 60] < (60, 100]]

# Integer code of the bin each age fell into.
print(cats.codes)
# Recorded output:
# [0 0 0 1 0 0 2 1 3 2 2 1]

# The bins themselves.
print(cats.categories)
# Recorded output:
# IntervalIndex([(18, 25], (25, 35], (35, 60], (60, 100]],
#               closed='right',
#               dtype='interval[int64]')

# As in mathematical interval notation, a parenthesis means the endpoint is
# excluded and a square bracket means it is included; which side is closed
# can be switched with right=False.
print(pd.cut(ages, [18, 26, 36, 61, 100], right=False))
# Recorded output:
# [[18, 26), [18, 26), [18, 26), [26, 36), [18, 26), ..., [26, 36), [61, 100), [36, 61), [36, 61), [26, 36)]
# Length: 12
# Categories (4, interval[int64]): [[18, 26) < [26, 36) < [36, 61) < [61, 100)]

# Custom bin names can be supplied through `labels`.
group_names = ["AAA", "BBB", "CCC", "DDD"]
print(pd.cut(ages, bins, labels=group_names))
# Recorded output:
# ['AAA', 'AAA', 'AAA', 'BBB', 'AAA', ..., 'BBB', 'DDD', 'CCC', 'CCC', 'BBB']
# Length: 12
# Categories (4, object): ['AAA' < 'BBB' < 'CCC' < 'DDD']

# When cut receives a number of bins instead of explicit edges, it computes
# equal-width bins from the minimum and maximum of the data.
print(pd.cut(ages, 4))
# Recorded output:
# [(19.959, 30.25], (19.959, 30.25], (19.959, 30.25], (19.959, 30.25], (19.959, 30.25], ..., (30.25, 40.5], (50.75, 61.0], (40.5, 50.75], (40.5, 50.75], (30.25, 40.5]]
# Length: 12
# Categories (4, interval[float64]): [(19.959, 30.25] < (30.25, 40.5] < (40.5, 50.75] < (50.75, 61.0]]

# qcut cuts on sample quantiles, so the bins hold roughly equal numbers of items.
ages_qcut = pd.qcut(ages, 4)
print(ages_qcut)
# Recorded output:
# [(19.999, 22.75], (19.999, 22.75], (22.75, 29.0], (22.75, 29.0], (19.999, 22.75], ..., (29.0, 38.0], (38.0, 61.0], (38.0, 61.0], (38.0, 61.0], (29.0, 38.0]]
# Length: 12
# Categories (4, interval[float64]): [(19.999, 22.75] < (22.75, 29.0] < (29.0, 38.0] < (38.0, 61.0]]

# The top-level pd.value_counts() is deprecated (pandas 2.1+); wrapping the
# Categorical in a Series and calling .value_counts() is the documented
# replacement and also works on older pandas versions.
qcut_counts = pd.Series(ages_qcut).value_counts()
print(qcut_counts)
# Recorded output (from the original pd.value_counts call):
# (19.999, 22.75]    3
# (22.75, 29.0]      3
# (29.0, 38.0]       3
# (38.0, 61.0]       3
# dtype: int64

 

# posted @ 2021-07-01 16:00  OTAKU_nicole  阅读(52)  评论(0)  编辑  收藏  举报