Data Discretization and Normalization
When performing data analysis, it is often necessary to normalize the data and then discretize it. A minimal sketch of both steps follows; the full script appears after it.
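Here is a small, self-contained sketch of the two operations (the sample array and labels are hypothetical, chosen only for illustration). Min-max normalization rescales each feature column to [0, 1] via x' = (x - min) / (max - min), and pd.cut then buckets the scaled values into labeled intervals:

import numpy as np
import pandas as pd
from sklearn import preprocessing

# 3 samples, 2 features; MinMaxScaler rescales each column to [0, 1]
# via x' = (x - min) / (max - min).
x = np.array([[1.0, 50.0],
              [2.0, 100.0],
              [3.0, 150.0]])
scaled = preprocessing.MinMaxScaler().fit_transform(x)
# scaled is [[0.0, 0.0], [0.5, 0.5], [1.0, 1.0]]

# pd.cut buckets values into labeled intervals;
# include_lowest=True keeps 0.0 inside the first bin.
cuts = pd.cut(scaled[:, 0], bins=[0, 0.25, 0.5, 0.75, 1],
              labels=['0-0.25', '0.25-0.5', '0.5-0.75', '0.75-1'],
              include_lowest=True)
print(list(cuts))  # ['0-0.25', '0.25-0.5', '0.75-1']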
import codecs
import pandas as pd
from sklearn import preprocessing

url = "resultData.txt"
FeatureNum = 6   # number of features to read from each line
data_num = 100   # one hundred rows of data
data = []

def open_file(url):
    # Read comma-separated rows, keeping the first FeatureNum fields as floats.
    with codecs.open(url, "r") as f:
        for line in f.readlines():
            fields = line.strip().split(',')
            data.append([float(fields[i]) for i in range(FeatureNum)])

def gui_yi_hua(data):
    # gui_yi_hua = "normalization": min-max scale each feature column to [0, 1].
    min_max_scaler = preprocessing.MinMaxScaler()
    tseg_minMax = min_max_scaler.fit_transform(data)
    # To save the normalized result:
    # pd.DataFrame(tseg_minMax).to_csv('tseg_out.csv')
    return tseg_minMax

def arry_discretization(tseg_minMax):
    # Discretize each row's scaled values into four labeled intervals.
    bins = [0, 0.25, 0.5, 0.75, 1]
    group_names = ['in 0-0.25', 'in 0.25-0.5', 'in 0.5-0.75', 'in 0.75-1']
    for tmp in tseg_minMax:
        print(tmp)
        # include_lowest=True so that values equal to 0 fall into the first bin
        # instead of becoming NaN.
        cuts = pd.cut(tmp, bins, labels=group_names, include_lowest=True)
        print(cuts)
        print(pd.Series(cuts).value_counts())

if __name__ == '__main__':
    open_file(url)
    arry_discretization(gui_yi_hua(data))
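The script expects a resultData.txt containing at least FeatureNum = 6 comma-separated values per line. No such file accompanies this post, so the snippet below (entirely synthetic, hypothetical data) generates one so the script can be run end to end:

import numpy as np

# Hypothetical helper: write a synthetic resultData.txt
# matching data_num = 100 rows and FeatureNum = 6 columns.
np.random.seed(0)                   # reproducible fake data
rows = np.random.rand(100, 6) * 10
with open("resultData.txt", "w") as f:
    for row in rows:
        f.write(",".join(f"{v:.4f}" for v in row) + "\n")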
Press on through wind and rain; a bright future awaits!