离散化和面元划分
# Discretization and binning with pandas: cut() for explicit or equal-width
# bins, qcut() for quantile-based bins.
# NOTE: the sample outputs shown in comments below were captured by the
# original author; the exact repr (e.g. the interval dtype string or the
# value_counts footer) may differ between pandas versions.
from pandas import DataFrame, Series
import pandas as pd
import numpy as np

ages = [20, 22, 25, 27, 21, 23, 37, 31, 61, 45, 41, 32]
bins = [18, 25, 35, 60, 100]

# Assign every age to one of the intervals delimited by consecutive bin edges.
cats = pd.cut(ages, bins)
print(cats)
# [(18, 25], (18, 25], (18, 25], (25, 35], (18, 25], ..., (25, 35], (60, 100],
#  (35, 60], (35, 60], (25, 35]]
# Length: 12
# Categories (4, interval[int64]): [(18, 25] < (25, 35] < (35, 60] < (60, 100]]

# Integer position of the bin each age fell into.
print(cats.codes)
# [0 0 0 1 0 0 2 1 3 2 2 1]

# The bins themselves, as an IntervalIndex.
print(cats.categories)
# IntervalIndex([(18, 25], (25, 35], (35, 60], (60, 100]],
#               closed='right', dtype='interval[int64]')

# As in mathematical interval notation, a parenthesis means the endpoint is
# excluded and a square bracket means it is included. Pass right=False to make
# the intervals closed on the left and open on the right instead.
print(pd.cut(ages, [18, 26, 36, 61, 100], right=False))
# [[18, 26), [18, 26), [18, 26), [26, 36), [18, 26), ..., [26, 36), [61, 100),
#  [36, 61), [36, 61), [26, 36)]
# Length: 12
# Categories (4, interval[int64]): [[18, 26) < [26, 36) < [36, 61) < [61, 100)]

# Bin names can be customized through the labels argument.
group_names = ["AAA", "BBB", "CCC", "DDD"]
print(pd.cut(ages, bins, labels=group_names))
# ['AAA', 'AAA', 'AAA', 'BBB', 'AAA', ..., 'BBB', 'DDD', 'CCC', 'CCC', 'BBB']
# Length: 12
# Categories (4, object): ['AAA' < 'BBB' < 'CCC' < 'DDD']

# When cut() receives a number of bins instead of explicit edges, it computes
# equal-width bins from the minimum and maximum of the data. The lowest edge
# is pushed slightly below the minimum (19.959 here) so the minimum value is
# still included in the first right-closed interval.
print(pd.cut(ages, 4))
# [(19.959, 30.25], (19.959, 30.25], (19.959, 30.25], (19.959, 30.25],
#  (19.959, 30.25], ..., (30.25, 40.5], (50.75, 61.0], (40.5, 50.75],
#  (40.5, 50.75], (30.25, 40.5]]
# Length: 12
# Categories (4, interval[float64]): [(19.959, 30.25] < (30.25, 40.5] <
#                                     (40.5, 50.75] < (50.75, 61.0]]

# qcut() bins by sample quantiles, so the bins hold roughly equal numbers of
# observations.
ages_qcut = pd.qcut(ages, 4)
print(ages_qcut)
# [(19.999, 22.75], (19.999, 22.75], (22.75, 29.0], (22.75, 29.0],
#  (19.999, 22.75], ..., (29.0, 38.0], (38.0, 61.0], (38.0, 61.0],
#  (38.0, 61.0], (29.0, 38.0]]
# Length: 12
# Categories (4, interval[float64]): [(19.999, 22.75] < (22.75, 29.0] <
#                                     (29.0, 38.0] < (38.0, 61.0]]

# Count the observations per quantile bin. The top-level pd.value_counts()
# helper is deprecated (pandas 2.1+); the Series method is its replacement.
qcut_counts = pd.Series(ages_qcut).value_counts()
print(qcut_counts)
# (19.999, 22.75]    3
# (22.75, 29.0]      3
# (29.0, 38.0]       3
# (38.0, 61.0]       3
# dtype: int64
本文来自博客园,作者:OTAKU_nicole,转载请注明原文链接:https://www.cnblogs.com/nicole-zhang/p/14959191.html