计算指标/哑变量
from pandas import DataFrame, Series
import pandas as pd
import numpy as np

# If one column of a DataFrame holds K distinct values, a K-column
# indicator (dummy) matrix can be derived from it.
df = DataFrame({'key': ['b', 'b', 'a', 'c', 'a', 'b'],
                'data': range(6)})
print(df)
# Expected output:
#   key  data
# 0   b     0
# 1   b     1
# 2   a     2
# 3   c     3
# 4   a     4
# 5   b     5

# dtype=int keeps the 0/1 output shown below; without it pandas >= 2.0
# returns boolean True/False columns.
dummies = pd.get_dummies(df['key'], prefix='key', dtype=int)
print(dummies)
# Expected output:
#    key_a  key_b  key_c
# 0      0      1      0
# 1      0      1      0
# 2      1      0      0
# 3      0      0      1
# 4      1      0      0
# 5      0      1      0

# Keep only the 'data' column and attach the indicator columns; the join
# aligns on the shared row index.
df_with_dummies = df[['data']].join(dummies)
print(df_with_dummies)
# Expected output:
#    data  key_a  key_b  key_c
# 0     0      0      1      0
# 1     1      0      1      0
# 2     2      1      0      0
# 3     3      0      0      1
# 4     4      1      0      0
# 5     5      0      1      0
from pandas import DataFrame, Series
import pandas as pd
import numpy as np

# The file is read with header=None, so column names are supplied here.
mnames = ['movie_id', 'title', 'genres']
# A multi-character separator such as '::' is only handled by the Python
# parsing engine, hence engine='python'.
movies = pd.read_table('movies.dat', sep='::', header=None, names=mnames,
                       encoding='ISO-8859-1', engine='python')

# Each 'genres' entry is a '|'-separated list of labels; collect every
# distinct label once and sort them to get a stable column order.
genres = sorted({label
                 for entry in movies.genres
                 for label in entry.split('|')})

# Build the indicator matrix (one row per movie, one column per genre) in a
# plain NumPy array and wrap it in a DataFrame once, instead of writing into
# the DataFrame row by row through .iloc.
column_of = {label: position for position, label in enumerate(genres)}
indicator = np.zeros((len(movies), len(genres)))
for i, gen in enumerate(movies.genres):
    # Mark every genre this movie belongs to.
    indicator[i, [column_of[label] for label in gen.split('|')]] = 1
dummies = DataFrame(indicator, columns=genres)

# Prefix the indicator columns so they are recognisable next to the
# original movie columns.
movies_windic = movies.join(dummies.add_prefix('Genre_'))
print(movies_windic.iloc[0].head(10))  # first 10 fields of the first row
import pandas as pd
import numpy as np

# Ten random floats; the sample outputs below come from one particular run
# and will differ on every execution.
values = np.random.rand(10)
print(values)
# Sample output:
# [0.07858525 0.87300262 0.35604229 0.93110966 0.79934318 0.08215684
#  0.96897297 0.3661382  0.22688337 0.50674505]

# Bin edges; pd.cut turns them into the right-closed intervals shown in the
# column labels below.
bins = [0, 0.2, 0.4, 0.6, 0.8, 1]

# Combine binning with get_dummies: one indicator column per interval.
# dtype=int keeps the 0/1 output shown below; without it pandas >= 2.0
# returns boolean True/False columns.
dummies = pd.get_dummies(pd.cut(values, bins), dtype=int)
print(dummies)
# Sample output:
#    (0.0, 0.2]  (0.2, 0.4]  (0.4, 0.6]  (0.6, 0.8]  (0.8, 1.0]
# 0           1           0           0           0           0
# 1           0           0           0           0           1
# 2           0           1           0           0           0
# 3           0           0           0           0           1
# 4           0           0           0           1           0
# 5           1           0           0           0           0
# 6           0           0           0           0           1
# 7           0           1           0           0           0
# 8           0           1           0           0           0
# 9           0           0           1           0           0
本文来自博客园,作者:OTAKU_nicole,转载请注明原文链接:https://www.cnblogs.com/nicole-zhang/p/15194668.html