数据清洗和准备
离散化和面元划分
示例(将每个取值映射为它在序列中的大小排名):6 8 9 4 ===> 2 3 4 1
ages = [20, 22, 25, 27, 21, 23, 37, 31, 61, 45, 41, 32]
bins = [18,25,35,60,100]
cats = pd.cut(ages,bins)
cats
[(18, 25], (18, 25], (18, 25], (25, 35], (18, 25], ..., (25, 35], (60, 100], (35, 60], (35, 60], (25, 35]]
Length: 12
Categories (4, interval[int64]): [(18, 25] < (25, 35] < (35, 60] < (60, 100]]
cats.codes
array([0, 0, 0, 1, 0, 0, 2, 1, 3, 2, 2, 1], dtype=int8)
cats.categories
IntervalIndex([(18, 25], (25, 35], (35, 60], (60, 100]]
closed='right',
dtype='interval[int64]')
pd.value_counts(cats)
(18, 25] 5
(35, 60] 3
(25, 35] 3
(60, 100] 1
dtype: int64
pd.cut(ages,[18,26,36,61,100],right=False)
[[18, 26), [18, 26), [18, 26), [26, 36), [18, 26), ..., [26, 36), [61, 100), [36, 61), [36, 61), [26, 36)]
Length: 12
Categories (4, interval[int64]): [[18, 26) < [26, 36) < [36, 61) < [61, 100)]
names = ['青年','年轻人','中年','老年']
pd.cut(ages,bins,labels=names)
[青年, 青年, 青年, 年轻人, 青年, ..., 年轻人, 老年, 中年, 中年, 年轻人]
Length: 12
Categories (4, object): [青年 < 年轻人 < 中年 < 老年]
data = np.random.rand(20)
data
array([ 0.04996272, 0.09751859, 0.93201166, 0.52240638, 0.02292138,
0.93153349, 0.51292955, 0.04350894, 0.58364788, 0.44534584,
0.31083907, 0.9286763 , 0.66816617, 0.77377502, 0.18961133,
0.66365819, 0.23383481, 0.53767344, 0.64420233, 0.67658029])
pd.cut(data,4,precision = 2)
[(0.022, 0.25], (0.022, 0.25], (0.7, 0.93], (0.48, 0.7], (0.022, 0.25], ..., (0.48, 0.7], (0.022, 0.25], (0.48, 0.7], (0.48, 0.7], (0.48, 0.7]]
Length: 20
Categories (4, interval[float64]): [(0.022, 0.25] < (0.25, 0.48] < (0.48, 0.7] < (0.7, 0.93]]
data = np.random.randn(1000)
data
array([ 4.85634729e-02, -1.34054158e+00, 2.72231862e-02,
2.46942122e-01, 7.44757369e-01, 2.04112537e+00,
2.88554056e-01, -1.59376789e-01, 1.03820893e+00,
2.34362566e-01, 1.26033030e-01, -5.30489341e-01,
3.35935612e-01, -1.28030309e+00, -1.82161864e+00,
1.24622137e+00, 1.79109860e+00, -1.11492088e+00,
-2.72757886e-01, 2.00095126e+00, 6.77932950e-02,
-1.61718635e+00, 8.86037558e-01, -3.68608873e-01,
-4.87571678e-01, 1.07434758e-02, 9.03368472e-01,
-2.00666200e+00, -4.62522278e-01, -3.19588645e-01,
......
cats = pd.qcut(data,4)
cats
[(-0.0313, 0.704], (-3.031, -0.727], (-0.0313, 0.704], (-0.0313, 0.704], (0.704, 3.386], ..., (-0.727, -0.0313], (-3.031, -0.727], (-0.727, -0.0313], (-3.031, -0.727], (0.704, 3.386]]
Length: 1000
Categories (4, interval[float64]): [(-3.031, -0.727] < (-0.727, -0.0313] < (-0.0313, 0.704] < (0.704, 3.386]]
pd.value_counts(cats)
(0.704, 3.386] 250
(-0.0313, 0.704] 250
(-0.727, -0.0313] 250
(-3.031, -0.727] 250
dtype: int64
cats = pd.qcut(data,[0,0.1,0.5,0.9,1.])
pd.value_counts(cats)
(-0.0313, 1.309] 400
(-1.382, -0.0313] 400
(1.309, 3.386] 100
(-3.031, -1.382] 100
dtype: int64
【推荐】国内首个AI IDE,深度理解中文开发场景,立即下载体验Trae
【推荐】编程新体验,更懂你的AI,立即体验豆包MarsCode编程助手
【推荐】抖音旗下AI助手豆包,你的智能百科全书,全免费不限次数
【推荐】轻量又高性能的 SSH 工具 IShell:AI 加持,快人一步
· Linux系列:如何用heaptrack跟踪.NET程序的非托管内存泄露
· 开发者必知的日志记录最佳实践
· SQL Server 2025 AI相关能力初探
· Linux系列:如何用 C#调用 C方法造成内存泄露
· AI与.NET技术实操系列(二):开始使用ML.NET
· 没有Manus邀请码?试试免邀请码的MGX或者开源的OpenManus吧
· 无需6万激活码!GitHub神秘组织3小时极速复刻Manus,手把手教你使用OpenManus搭建本
· C#/.NET/.NET Core优秀项目和框架2025年2月简报
· DeepSeek在M芯片Mac上本地化部署
· 葡萄城 AI 搜索升级:DeepSeek 加持,客户体验更智能