# 代码 (Code)
#!/usr/bin/python
# -*- coding:utf-8 -*-
"""Build LDA topic-probability features from the ``taglist`` column.

The script loads the train/test/app tables, tokenises each user's
``taglist`` (tags separated by ``|``), fits TF-IDF followed by an LDA model
on the whole table, and assembles one row of topic probabilities per record.
"""
import datetime as dt
import gc
import math
import warnings
from multiprocessing import Pool
from multiprocessing import cpu_count

import jieba
import matplotlib as mpl
import numpy as np
import pandas as pd
from dask import dataframe as dd
from dask.multiprocessing import get
from gensim import corpora, models, similarities
from pandas.tseries.offsets import Day, MonthEnd, MonthBegin
from sklearn.preprocessing import LabelEncoder

mpl.rcParams['font.sans-serif'] = ['SimHei']
mpl.rcParams['font.serif'] = ['SimHei']
warnings.filterwarnings("ignore")

# Number of LDA topics to fit; also the width of the generated feature block.
NUM_TOPICS = 100


def getlda(doc_topics, x, num_show_topic, col):
    """Return the topic distribution of one document as a one-row DataFrame.

    :param doc_topics: indexable collection; ``doc_topics[x]`` yields
        ``(topic_id, probability)`` pairs for document ``x``
    :param x: position of the document (sample) inside ``doc_topics``
    :param num_show_topic: number of topics to emit (topic ids
        ``0 .. num_show_topic - 1``)
    :param col: source column name, used as prefix of the output columns
    :return: DataFrame with a single float32 row and columns
        ``<col>lda0 .. <col>lda<num_show_topic - 1>``

    Topics that are absent from ``doc_topics[x]`` get probability 0. This also
    covers a document with no reported topic at all, which previously raised
    ``IndexError``. Topic ids outside ``0 .. num_show_topic - 1`` are ignored
    instead of causing a column-length mismatch.
    """
    probabilities = np.zeros(num_show_topic, dtype=np.float32)
    for topic_id, probability in doc_topics[x]:
        topic_id = int(topic_id)
        if 0 <= topic_id < num_show_topic:
            probabilities[topic_id] = probability
    columns = [col + 'lda' + str(i) for i in range(num_show_topic)]
    return pd.DataFrame([probabilities], columns=columns)


# Load the data.
test = pd.read_csv('../data/age_test.csv', header=None)
test.columns = ['uid']
train = pd.read_csv('../data/age_train.csv', header=None)
train.columns = ['uid', 'label']
app_actived = pd.read_csv('../data/app_actived.csv', header=None)
app_actived.columns = ['uid', 'appid']
print(app_actived.shape)
print(train.shape)
print(test.shape)

# Split the app_actived table into its test and train parts.
test_actived = pd.merge(test, app_actived, on='uid', how='left')
train_actived = pd.merge(train, app_actived, on='uid', how='left')
print(test_actived.shape)
print(train_actived.shape)

# Pre-process the user_taglist table.
# NOTE(review): this is an absolute, machine-specific path while the other
# inputs are read from '../data/' -- confirm which location is intended.
user_taglist = pd.read_csv('/home/sxtj/han/PPAI/data/user_taglist.csv',
                           parse_dates=['insertdate'])
print(user_taglist.shape)

# TF-IDF derives one weight per feature from the whole table, so this step
# leaks information across the train/test boundary (flagged by the author).
columstfidf = ['taglist']


def fundic(x):
    """Split a ``|``-separated tag string into a list of tags."""
    return x.split('|')


print('processing taglist')
for item in columstfidf:
    # Turn the column into a tokenised text corpus.
    testdata = list(user_taglist[item].map(fundic))
    user_taglist.drop(item, axis=1, inplace=True)
    dictionary = corpora.Dictionary(testdata)
    corpus = [dictionary.doc2bow(text) for text in testdata]
    corpus_tfidf = models.TfidfModel(corpus)[corpus]  # tfidf
    # weight = corpus_tfidf.obj.idfs
    lda = models.LdaMulticore(
        corpus_tfidf,
        num_topics=NUM_TOPICS,
        id2word=dictionary,
        chunksize=2000,
        passes=1,
        random_state=0,
        minimum_probability=0.005,
        workers=11,
    )
    # lda.save('./model/' + item + '_ldanew.model')  # keep for the test set
    doc_topics = lda.get_document_topics(corpus_tfidf)

    # Collect the per-document topic probabilities, one row per record.
    print('num_topicsuese……')
    # Documents are addressed by position, which matches the default
    # 0..n-1 index that read_csv produced for user_taglist.
    topic_rows = [
        getlda(doc_topics, position, NUM_TOPICS, item)
        for position in range(len(user_taglist))
    ]
    dfjoin = pd.concat(topic_rows, ignore_index=True)
    print(dfjoin.head())

    # Follow-up steps that were disabled in the original script:
    # print(dfjoin.head(5))
    # print('saving taglist')
    # # dfjoin.to_hdf('dfjoin.h5','dfjoin')
    # del testdata,dictionary,corpus,corpus_tfidf,doc_topics
    # gc.collect()
    # user_taglistfe = user_taglist.join(dfjoin)
    # del dfjoin,user_taglist
    # gc.collect()