关闭页面特效

有监督的卡方分箱算法

实现代码


from collections import Counter

import numpy as np
import pandas as pd


def _pair_chi2(count_0, count_1):
    """Return the Chi2 statistic for the label counts of two adjacent intervals.

    ``count_0`` and ``count_1`` are integer arrays holding, per label, the
    number of rows that fall into each interval.
    """
    count_total = count_0 + count_1
    total = count_total.sum()
    # A label absent from both intervals has an expected count of zero, so
    # its term is 0/0. Those terms are defined as 0 below; silence the
    # floating-point warnings NumPy would otherwise emit for them.
    with np.errstate(divide='ignore', invalid='ignore'):
        expected_0 = count_total * count_0.sum() / total
        expected_1 = count_total * count_1.sum() / total
        terms = ((count_0 - expected_0) ** 2 / expected_0
                 + (count_1 - expected_1) ** 2 / expected_1)
    return np.nan_to_num(terms).sum()


def chimerge(data, attr, label, max_intervals):
    """Discretize ``data[attr]`` with the supervised ChiMerge algorithm.

    Every distinct value of ``attr`` starts as its own closed interval.
    While more than ``max_intervals`` intervals remain, the adjacent pair
    with the smallest Chi2 statistic (first pair on ties) is merged.
    The final intervals are printed one per line as ``[low,high]``.

    Args:
        data: DataFrame containing the ``attr`` and ``label`` columns.
        attr: Name of the column to discretize.
        label: Name of the class-label column.
        max_intervals: Maximum number of intervals to keep; must be >= 1.

    Returns:
        The final intervals as a list of ``[low, high]`` pairs, in
        ascending order.

    Raises:
        ValueError: If ``max_intervals`` is smaller than 1.
    """
    if max_intervals < 1:
        # Without this guard the merge loop would run down to one interval
        # and then fail on min() of an empty list.
        raise ValueError('max_intervals must be at least 1')

    distinct_vals = sorted(set(data[attr]))  # Sorted distinct attribute values
    labels = sorted(set(data[label]))  # All possible labels, in a fixed order

    # Count each (value, label) combination once, instead of re-filtering
    # the DataFrame for every adjacent pair on every iteration.
    pair_counts = Counter(zip(data[attr], data[label]))

    intervals = [[v, v] for v in distinct_vals]
    # counts[i] is the per-label row count of intervals[i].
    counts = [np.array([pair_counts[(v, l)] for l in labels])
              for v in distinct_vals]

    while len(intervals) > max_intervals:
        chi = [_pair_chi2(counts[i], counts[i + 1])
               for i in range(len(intervals) - 1)]
        merge_at = chi.index(min(chi))  # First pair with the minimal Chi2

        # Replace the two neighbours by their union; intervals are sorted,
        # so the union spans from the left low to the right high bound.
        intervals[merge_at:merge_at + 2] = [
            [intervals[merge_at][0], intervals[merge_at + 1][1]]
        ]
        counts[merge_at:merge_at + 2] = [counts[merge_at] + counts[merge_at + 1]]

    for low, high in intervals:
        print('[', low, ',', high, ']', sep='')
    return intervals

使用例子


# Example: discretize each numeric feature of the UCI iris data set into
# at most three intervals, using the species column as the class label.
IRIS_URL = 'http://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data'
FEATURE_COLUMNS = ['sepal_l', 'sepal_w', 'petal_l', 'petal_w']

# The raw file has no header row, so the column names are supplied here.
iris = pd.read_csv(IRIS_URL, header=None, names=FEATURE_COLUMNS + ['type'])

for attr in FEATURE_COLUMNS:
    print('Interval for', attr)
    chimerge(data=iris, attr=attr, label='type', max_intervals=3)

结果:


__EOF__

作  者:Hichens
出  处:https://www.cnblogs.com/hichens/p/13585854.html
关于博主:莫得感情的浅度学习机器人
版权声明:@Hichens
声援博主:如果您觉得文章对您有帮助,可以点击文章右下角推荐一下。您的鼓励是博主的最大动力!

posted @   hichens  阅读(563)  评论(0)  编辑  收藏  举报
编辑推荐:
· 记一次.NET内存居高不下排查解决与启示
· 探究高空视频全景AR技术的实现原理
· 理解Rust引用及其生命周期标识(上)
· 浏览器原生「磁吸」效果!Anchor Positioning 锚点定位神器解析
· 没有源码,如何修改代码逻辑?
阅读排行:
· 全程不用写代码,我用AI程序员写了一个飞机大战
· DeepSeek 开源周回顾「GitHub 热点速览」
· 记一次.NET内存居高不下排查解决与启示
· MongoDB 8.0这个新功能碉堡了,比商业数据库还牛
· .NET10 - 预览版1新功能体验(一)
点击右上角即可分享
微信分享提示