Extracting feature columns and attribute values from vowpal wabbit data to generate DNN training/test data
User feature file userFeature.data: each line holds one user's feature data in the format "uid|features", with uid and features separated by a vertical bar "|". The features use the vowpal wabbit (https://github.com/JohnLangford/vowpal_wabbit) format: "feature_group1|feature_group2|feature_group3|...". Each feature_group is one feature group, and feature groups are also separated by "|". If a feature group contains several values they are separated by spaces, in the format "feature_group_name fea_name1 fea_name2 ...", where each fea_name is a numeric ID.
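To make the format concrete, here is a minimal parsing sketch; the sample line below is hypothetical and only follows the description above.

# Minimal sketch: parse one "uid|features" line of the vowpal-wabbit-like
# format described above. The sample line is made up for illustration.
line = 'uid 268745|age 4|marriageStatus 13 10|interest1 93 70 77'

record = {}
for group in line.rstrip('\n').split('|'):   # feature groups separated by '|'
    parts = group.split(' ')                 # "name value1 value2 ..." inside a group
    record[parts[0]] = parts[1:]             # values are numeric IDs, kept as strings

print(record['uid'])        # ['268745']
print(record['interest1'])  # ['93', '70', '77']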
Extracting the feature columns and their attribute values
f = 'userFeature.data'

# First pass: collect, per feature group, every distinct value seen.
d = {}
with open(f, 'r') as fr:
    for i in fr:
        l = i.split('|')
        for ii in l:
            ll = ii.split(' ')
            k = ll[0]                     # feature group name (or 'uid')
            if k not in d:
                d[k] = []
            for iii in ll[1:]:
                iii_ = int(iii)
                if iii_ not in d[k]:      # O(n) membership test on a list
                    d[k].append(iii_)

for k in d:
    l = sorted(d[k], reverse=False)
    print(k)
    print(l)
Batch processing to reduce runtime
d, reduce_chk_counter, reduce_chk_step = {}, 0, 500000
with open(f, 'r') as fr:
    for i in fr:
        l = i.split('|')
        for ii in l:
            ll = ii.split(' ')
            k = ll[0]
            if k not in d:
                d[k] = []
            for iii in ll[1:]:
                iii_ = int(iii)
                d[k].append(iii_)          # append unconditionally ...
        reduce_chk_counter += 1
        if reduce_chk_counter == reduce_chk_step:
            reduce_chk_counter = 0
            for k in d:                    # ... and deduplicate in batches
                d[k] = list(set(d[k]))
                l = sorted(d[k], reverse=False)
                print(k)
                print(l)

for k in d:                                # final deduplication pass
    d[k] = list(set(d[k]))
    l = sorted(d[k], reverse=False)
    print(k)
    print(l)

res_f = 'toknowit.txt'
with open(res_f, 'w') as fw:
    for k in d:
        l = sorted(d[k], reverse=False)
        print(k)
        print(l)
        # join() needs strings, so convert the int values first
        s = '{}\n{}\n'.format(k, ','.join([str(i) for i in l]))
        fw.write(s)
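The batch trick exists because the first version's membership test scans a growing list for every value. An alternative sketch, not the code used in these notes: collect into sets from the start, which bounds memory by the number of distinct values and removes the periodic cleanup entirely.

# Alternative sketch: per-feature sets avoid both the O(n) membership test
# and the periodic list(set(...)) cleanup. Assumes userFeature.data as above.
from collections import defaultdict

f = 'userFeature.data'
d = defaultdict(set)
with open(f, 'r') as fr:
    for line in fr:
        for group in line.rstrip('\n').split('|'):
            parts = group.split(' ')
            if parts[0] == 'uid':
                continue
            d[parts[0]].update(int(v) for v in parts[1:])

for k in d:
    print(k)
    print(sorted(d[k]))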
Load the values into memory with duplicates, deduplicate in batches, and release memory
f = 'userFeature.data'

# Probe: which values does the marriageStatus feature column take?
d, d1 = {}, {}
with open(f, 'r') as fr:
    for i in fr:
        break  # toggle left active to skip this already-completed pass; remove to re-run
        l = i.split('|')
        for ii in l:
            if 'marriageStatus' in ii:
                k = len(ii)                         # index the group by its string length
                d[k] = ii
                k = ii.split('marriageStatus')[-1]  # index the group by its value suffix
                d1[k] = ii
for k in d:
    print(k, d[k])
for k in d1:
    print(k, d1[k])
'''
17 marriageStatus 11
19 marriageStatus 2 13
20 marriageStatus 13 10
16 marriageStatus 0
21 marriageStatus 2 13 9
22 marriageStatus 12 13 9
23 marriageStatus 12 13 10
11 marriageStatus 11
5 13 marriageStatus 5 13
13 10 marriageStatus 13 10
10 marriageStatus 10
15 marriageStatus 15
0 marriageStatus 0
13 15 marriageStatus 13 15
12 13 marriageStatus 12 13
13 marriageStatus 13
6 13 marriageStatus 6 13
2 13 marriageStatus 2 13
13 9 marriageStatus 13 9
6 13 9 marriageStatus 6 13 9
2 13 9 marriageStatus 2 13 9
5 13 9 marriageStatus 5 13 9
12 13 9 marriageStatus 12 13 9
14 marriageStatus 14
12 13 10 marriageStatus 12 13 10
3 marriageStatus 3
15 10 marriageStatus 15 10
8 marriageStatus 8
6 13 10 marriageStatus 6 13 10
5 13 10 marriageStatus 5 13 10
13 10 9 marriageStatus 13 10 9
13 15 10 marriageStatus 13 15 10
2 13 10 marriageStatus 2 13 10
marriageStatus 0 2 5 6 8 9 10 11 12 13 15
'''

# Same scan as the first version, skipped here with a break toggle
d = {}
with open(f, 'r') as fr:
    for i in fr:
        break  # toggle: remove to re-run this pass
        l = i.split('|')
        for ii in l:
            ll = ii.split(' ')
            k = ll[0]
            if k not in d:
                d[k] = []
            for iii in ll[1:]:
                iii_ = int(iii)
                if iii_ not in d[k]:
                    d[k].append(iii_)

# Batched variant that also skips the uid column
d, reduce_chk_counter, reduce_chk_step = {}, 0, 500000
with open(f, 'r') as fr:
    for i in fr:
        break  # toggle: remove to re-run this pass
        l = i.split('|')
        for ii in l:
            ll = ii.split(' ')
            k = ll[0]
            if k == 'uid':
                continue
            if k not in d:
                d[k] = []
            for iii in ll[1:]:
                iii_ = int(iii)
                d[k].append(iii_)
        reduce_chk_counter += 1
        if reduce_chk_counter == reduce_chk_step:
            reduce_chk_counter = 0
            for k in d:
                d[k] = list(set(d[k]))
                l = sorted(d[k], reverse=False)
                print(k)
                print(l)

for k in d:
    break  # toggle: remove to print the final, deduplicated values
    d[k] = list(set(d[k]))
    l = sorted(d[k], reverse=False)
    print(k)
    print(l)

res_f = 'toknowit.txt'
# with open(res_f, 'w') as fw:
#     for k in d:
#         l = sorted(d[k], reverse=False)
#         print(k)
#         print(l)
#         s = '{}\n{}\n'.format(k, ','.join([str(i) for i in l]))
#         fw.write(s)

# Cut each line of toknowit.txt down to its first 200 fields
cut_l = []
with open(res_f, 'r') as fr:
    for i in fr:
        l = i.replace('\n', '').split(',')[0:200]
        cut_l.append(l)

res_f_cut = 'toknowitCUT.txt'
with open(res_f_cut, 'w') as fw:
    for l in cut_l:
        s = '{}\n'.format(','.join([str(i) for i in l]))
        fw.write(s)
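As an aside, the marriageStatus probe above keys the combinations by string length and by suffix; a Counter over the normalized value tuple would give the same inventory plus frequencies. A sketch under the same file-format assumptions:

# Sketch: count distinct marriageStatus value combinations directly.
from collections import Counter

f = 'userFeature.data'
combos = Counter()
with open(f, 'r') as fr:
    for line in fr:
        for group in line.rstrip('\n').split('|'):
            parts = group.split(' ')
            if parts[0] == 'marriageStatus':
                combos[tuple(sorted(int(v) for v in parts[1:]))] += 1

for combo, n in combos.most_common():
    print(combo, n)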
Contents of toknowitCUT.txt (each feature name on one line, its first 200 values on the next):

age
0,1,2,3,4,5
gender
0,1,2
marriageStatus
0,2,3,5,6,8,9,10,11,12,13,14,15
education
0,1,2,3,4,5,6,7
consumptionAbility
0,1,2
LBS
0,1,2,3,4,6,7,8,9,10,11,12,13,14,15,16,18,19,20,21,23,25,26,27,29,30,31,32,33,35,36,38,39,40,41,42,43,45,46,47,48,49,50,51,52,53,54,55,56,57,60,61,62,63,64,65,66,67,68,69,70,71,72,73,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,91,92,94,95,97,98,99,100,101,102,103,104,105,106,107,108,110,111,112,113,114,115,116,117,118,119,120,121,122,123,124,125,126,127,128,129,130,131,132,133,135,137,138,139,142,143,144,145,146,147,149,150,151,152,153,154,155,157,158,159,160,161,163,165,168,170,171,172,173,174,175,176,177,178,179,180,181,183,184,185,186,188,189,190,191,192,193,194,195,197,198,199,200,201,202,203,204,206,208,209,210,211,212,214,215,216,217,218,219,220,222,223,224,225,227,229,232,233,234,235,236
interest1
1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,96,97,98,99,100,101,102,103,104,105,106,107,108,109,110,111,112,113,114,115,116,117,118,119,120,121,122
interest2
1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64,65,66,67,68,69,70,71,72,73,74,75,76,78,79,80,81,82
interest5
1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,96,97,98,99,100,101,102,103,104,105,106,107,108,109,110,111,112,113,114,115,116,117,118,119,120,121,122,123,124,125,126,127,128,129,130,131,132,133,134,135,136
kw1
2,3,8,13,17,19,21,28,29,39,41,42,43,46,56,59,65,68,69,70,71,72,74,86,87,88,90,92,95,100,101,105,106,109,111,112,113,119,121,123,125,131,133,136,139,141,142,143,145,150,152,156,157,162,163,166,169,172,173,174,176,177,180,181,183,184,185,186,191,199,203,204,209,211,214,216,230,235,240,242,243,246,249,260,263,265,268,269,271,272,278,279,283,284,289,291,292,295,302,303,304,307,313,317,321,322,323,331,336,341,343,344,351,354,357,358,359,366,367,369,370,372,373,375,376,377,378,380,381,382,390,391,393,396,401,402,406,407,408,409,411,414,417,423,429,433,434,437,438,441,442,443,449,456,464,465,468,472,473,475,477,478,480,482,485,486,487,495,496,497,504,506,507,511,513,521,522,526,532,536,541,542,546,547,560,561,563,566,567,575,576,578,581,584,588,592,594,604,605,610
kw2
2,6,7,9,10,11,12,14,21,22,23,25,26,30,34,38,40,41,42,43,44,46,47,50,55,56,62,63,66,69,70,71,72,74,75,76,77,78,80,81,84,85,87,89,90,91,94,95,100,112,114,116,117,118,119,121,123,124,127,128,129,130,133,135,137,142,143,144,148,149,151,153,154,156,157,158,163,168,171,174,176,177,180,183,184,186,192,193,195,196,197,200,202,203,215,216,217,219,221,223,228,229,235,237,238,240,241,246,248,250,255,258,260,263,266,269,272,275,276,278,280,286,287,290,294,295,296,297,301,302,303,305,313,317,321,323,327,330,333,334,338,339,340,341,343,344,345,347,354,358,359,363,366,368,369,371,374,375,377,378,380,383,384,386,391,393,394,395,398,399,400,403,404,405,408,409,412,413,417,418,422,427,433,436,438,440,442,445,447,448,451,453,454,455,456,457,459,461,462,463
topic1
0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,96,97,98,99,100,101,102,103,104,105,106,107,108,109,110,111,112,113,114,115,116,117,118,119,120,121,122,123,124,125,126,127,128,129,130,131,132,133,134,135,136,137,138,139,140,141,142,143,144,145,146,147,148,149,150,151,152,153,154,155,156,157,158,159,160,161,162,163,164,165,166,167,168,169,170,171,172,173,174,175,176,177,178,179,180,181,182,183,184,185,186,187,188,189,190,191,192,193,194,195,196,197,198,199
topic2
0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,96,97,98,99,100,101,102,103,104,105,106,107,108,109,110,111,112,113,114,115,116,117,118,119,120,121,122,123,124,125,126,127,128,129,130,131,132,133,134,135,136,137,138,139,140,141,142,143,144,145,146,147,148,149,150,151,152,153,154,155,156,157,158,159,160,161,162,163,164,165,166,167,168,169,170,171,172,173,174,175,176,177,178,179,180,181,182,183,184,185,186,187,188,189,190,191,192,193,194,195,196,197,198,199
ct
0,1,2,3,4
os
0,1,2
carrier
0,1,2,3
house
1
interest3
1,2,3,4,5,6,7,8,9,10
kw3
1,7,8,10,15,19,25,27,29,36,50,56,63,68,69,74,77,80,88,93,95,101,117,122,123,124,126,132,133,136,138,149,151,152,153,155,157,164,167,171,173,174,181,186,188,190,192,194,197,198,206,209,213,223,228,233,235,248,249,253,263,273,276,278,280,286,288,295,302,303,311,314,316,323,328,331,332,333,336,343,349,362,364,366,370,372,381,385,391,394,399,401,404,411,412,416,420,425,427,431,453,459,464,465,469,470,474,488,499,504,505,508,512,513,523,530,531,534,539,549,559,560,563,566,568,570,574,581,586,588,598,607,610,617,627,630,633,634,635,636,638,645,650,654,655,657,663,668,676,677,681,685,686,687,691,692,694,695,696,699,701,703,705,707,709,719,722,723,725,734,735,737,739,740,742,745,751,755,763,764,769,771,780,785,788,799,800,805,809,818,821,833,835,836,840,851,853,856,860,862
topic3
1,3,4,8,10,11,14,16,18,19,21,22,23,24,25,27,28,30,31,32,33,34,35,37,39,42,43,44,46,47,49,51,53,54,55,56,58,59,60,62,63,65,66,68,69,70,72,75,76,78,79,81,84,87,88,90,92,95,98,99,100,101,102,103,107,108,109,111,112,113,115,116,117,119,120,121,123,124,126,127,129,130,132,133,136,137,138,139,141,142,143,146,148,150,151,154,157,158,159,161,162,164,165,166,167,168,169,171,174,176,177,178,180,182,183,185,186,187,188,190,191,192,193,194,197,198,199,201,202,205,206,207,209,210,211,212,213,214,215,216,217,218,219,220,221,223,226,227,228,232,233,234,235,237,238,240,241,243,251,252,253,255,256,258,259,260,262,264,265,266,267,268,269,270,271,272,273,274,275,278,279,280,282,283,285,287,288,292,297,298,299,301,304,305,306,307,308,309,312,314
interest4
1,2,3,4,5,6,7,8,9,10
appIdAction
1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,96,97,98,99,100,101,102,103,104,105,106,107,108,109,110,111,112,113,114,115,116,117,118,119,120,121,122,123,124,125,126,127,128,129,130,131,132,133,134,135,136,137,138,139,140,141,142,143,144,145,146,147,148,149,150,151,152,153,154,155,156,157,158,159,160,161,162,163,164,165,166,167,168,169,170,171,172,173,174,175,176,177,178,179,180,181,182,183,184,185,186,187,188,189,190,191,192,193,194,195,196,197,198,199,200
appIdInstall
1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,96,97,98,99,100,101,102,103,104,105,106,107,108,109,110,111,112,113,114,115,116,117,118,119,120,121,122,123,124,125,126,127,128,129,130,131,132,133,134,135,136,137,138,139,140,141,142,143,144,145,146,147,148,149,150,151,152,153,154,155,156,157,158,159,160,161,162,163,164,165,166,167,168,169,170,171,172,173,174,175,176,177,178,179,180,181,182,183,184,185,186,187,188,189,190,191,192,193,194,195,196,197,198,199,200
res_f_cut = 'toknowitCUT.txt'
# with open(res_f_cut, 'w') as fw:
#     s = '\n'.join([','.join([str(i) for i in l]) for l in cut_l])
#     fw.write(s)

# Expand a feature into per-value sub-features only if it has at most
# sparse_num_drop_max distinct values; otherwise keep it as a single column.
sparse_num_drop_max, is_odd_line = 50, True
with open(res_f_cut, 'r') as fr:
    for i in fr:
        l = i.replace('\n', '').split(',')
        if is_odd_line:              # odd lines carry the feature name
            is_odd_line = False
            k = l[0]
        else:                        # even lines carry the value list
            is_odd_line = True
            if len(l) <= sparse_num_drop_max:
                for ii in l:
                    s = '{}_{}'.format(k, ii)
                    print(s)
            else:
                print(k)
age_0 age_1 age_2 age_3 age_4 age_5 gender_0 gender_1 gender_2 marriageStatus_0 marriageStatus_2 marriageStatus_3 marriageStatus_5 marriageStatus_6 marriageStatus_8 marriageStatus_9 marriageStatus_10 marriageStatus_11 marriageStatus_12 marriageStatus_13 marriageStatus_14 marriageStatus_15 education_0 education_1 education_2 education_3 education_4 education_5 education_6 education_7 consumptionAbility_0 consumptionAbility_1 consumptionAbility_2 LBS interest1 interest2 interest5 kw1 kw2 topic1 topic2 ct_0 ct_1 ct_2 ct_3 ct_4 os_0 os_1 os_2 carrier_0 carrier_1 carrier_2 carrier_3 house_1 interest3_1 interest3_2 interest3_3 interest3_4 interest3_5 interest3_6 interest3_7 interest3_8 interest3_9 interest3_10 kw3 topic3 interest4_1 interest4_2 interest4_3 interest4_4 interest4_5 interest4_6 interest4_7 interest4_8 interest4_9 interest4_10 appIdAction appIdInstall
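The odd/even-line flag above works, but pairing consecutive lines reads the two-lines-per-feature layout more directly. A sketch, assuming toknowitCUT.txt as written above:

# Sketch: read the "name line, then value line" layout by pairing lines.
res_f_cut = 'toknowitCUT.txt'
sparse_num_drop_max = 50

with open(res_f_cut, 'r') as fr:
    lines = [line.rstrip('\n') for line in fr]

for name_line, value_line in zip(lines[0::2], lines[1::2]):
    k = name_line.split(',')[0]
    values = value_line.split(',')
    if len(values) <= sparse_num_drop_max:
        for v in values:
            print('{}_{}'.format(k, v))
    else:
        print(k)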
['uid', 'age', 'gender', 'marriageStatus', 'education', 'consumptionAbility', 'LBS', 'interest1', 'interest2', 'interest3', 'interest4', 'interest5', 'kw1', 'kw2', 'kw3', 'topic1', 'topic2', 'topic3', 'appIdInstall', 'appIdAction', 'ct', 'os', 'carrier', 'house'] ['age_0', 'age_1', 'age_2', 'age_3', 'age_4', 'age_5', 'gender_0', 'gender_1', 'gender_2', 'marriageStatus_0', 'marriageStatus_2', 'marriageStatus_3', 'marriageStatus_5', 'marriageStatus_6', 'marriageStatus_8', 'marriageStatus_9', 'marriageStatus_10', 'marriageStatus_11', 'marriageStatus_12', 'marriageStatus_13', 'marriageStatus_14', 'marriageStatus_15', 'education_0', 'education_1', 'education_2', 'education_3', 'education_4', 'education_5', 'education_6', 'education_7', 'consumptionAbility_0', 'consumptionAbility_1', 'consumptionAbility_2', 'LBS_0', 'interest1_0', 'interest2_0', 'interest3_1', 'interest3_2', 'interest3_3', 'interest3_4', 'interest3_5', 'interest3_6', 'interest3_7', 'interest3_8', 'interest3_9', 'interest3_10', 'interest4_1', 'interest4_2', 'interest4_3', 'interest4_4', 'interest4_5', 'interest4_6', 'interest4_7', 'interest4_8', 'interest4_9', 'interest4_10', 'interest5_0', 'kw1_0', 'kw2_0', 'kw3_0', 'topic1_0', 'topic2_0', 'topic3_0', 'appIdInstall_0', 'appIdAction_0', 'ct_0', 'ct_1', 'ct_2', 'ct_3', 'ct_4', 'os_0', 'os_1', 'os_2', 'carrier_0', 'carrier_1', 'carrier_2', 'carrier_3', 'house_1']
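Each expanded name such as marriageStatus_13 is one binary column of the eventual DNN input. A sketch of encoding one parsed user record against such a column list; the columns and record below are illustrative, not real data:

# Sketch: one-hot encode a parsed user record against expanded column names.
columns = ['age_0', 'age_1', 'age_2', 'marriageStatus_10', 'marriageStatus_13']
record = {'age': ['1'], 'marriageStatus': ['13', '10']}   # hypothetical user

row = []
for col in columns:
    name, _, val = col.rpartition('_')    # 'marriageStatus_13' -> ('marriageStatus', '13')
    row.append(1 if val in record.get(name, []) else 0)
print(row)  # [0, 1, 0, 1, 1]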
f = 'userFeature.data'

'''
(marriageStatus value inventory: same dump as in the probe above, omitted here)
'''


def fw_s(f, s):
    with open(f, 'w') as fw:
        fw.write(s)


# d: the values each feature takes; if a feature has more than one value,
#    consider splitting it into per-value sub-features
# k_order_l: the order of the first-level features
res_f = 'toknowit.txt'
res_f_k_order = res_f.replace('.', 'KeyOrder.')
res_f_cut, children_val_max = res_f.replace('.', 'Cut.'), 50

to_write = True
to_write = False  # toggle: flip back to True to regenerate the files
if to_write:
    d, reduce_chk_counter, reduce_chk_step, k_order_l = {}, 0, 500000, []
    with open(f, 'r') as fr:
        for i in fr:
            l = i.replace('\n', '').split('|')
            k_order_l_this = []
            for ii in l:
                ll = ii.split(' ')
                k = ll[0]
                k_order_l_this.append(k)
                if k == 'uid':
                    continue
                if k not in d:
                    d[k] = []
                # order -->int
                for iii in ll[1:]:
                    d[k].append(int(iii))
            k_order_l.append(k_order_l_this)
            reduce_chk_counter += 1
            print(reduce_chk_counter)
            if reduce_chk_counter % reduce_chk_step == 0:
                # reduce_chk_counter = 0
                for k in d:
                    d[k] = list(set(d[k]))
                # set() raises TypeError: unhashable type: 'list', so dedupe by index
                k_order_l = [e for i, e in enumerate(k_order_l) if k_order_l.index(e) == i]
    for k in d:
        d[k] = sorted(list(set(d[k])), reverse=False)
    k_order_l = [e for i, e in enumerate(k_order_l) if k_order_l.index(e) == i]
    s = '\n'.join(['{}\n{}'.format(k, ','.join([str(i) for i in d[k]])) for k in d])
    fw_s(res_f, s)
    s = '\n'.join(['{}\n{}'.format(k, ','.join([str(i) for i in d[k][0:children_val_max]])) for k in d])
    fw_s(res_f_cut, s)
    s = '\n'.join(['|'.join(l) for l in k_order_l])
    fw_s(res_f_k_order, s)

with open(res_f_k_order, 'r') as fr:
    ori_feature_l = [i.replace('\n', '').split('|') for i in fr]

# For every feature, collect the features ever seen after it; sorting by that
# set's size recovers the global feature order from the incomplete rows
feature_after_e_d = {}
for l in ori_feature_l:
    for e in l:
        if e not in feature_after_e_d:
            feature_after_e_d[e] = []
        feature_after_e_d[e] += l[l.index(e) + 1:]
        feature_after_e_d[e] = list(set(feature_after_e_d[e]))
feature_l = [k for k in sorted(feature_after_e_d, key=lambda e: len(feature_after_e_d[e]), reverse=True)]
print(feature_l)

import re

# Collapse numbered variants (kw1, kw2, ... -> kw); relies on the digit being
# the trailing character, since endpos is the end of the searched string
feature_reduce_l = [i if re.search(r'\d', i) is None else i[0:re.search(r'\d', i).endpos - 1]
                    for i in feature_l]
# set() destroys the order
print(feature_reduce_l)
print(list(set(feature_reduce_l)))
feature_reduce_l = [e for i, e in enumerate(feature_reduce_l) if feature_reduce_l.index(e) == i]
print(feature_reduce_l)

sparse_num_drop_max, is_odd_line = 20, True
# attribute values per feature
res_d = {}
with open(res_f_cut, 'r') as fr:
    for i in fr:
        l = i.replace('\n', '').split(',')
        if is_odd_line:
            is_odd_line = False
            k = l[0]
            res_d[k] = []
        else:
            is_odd_line = True
            if len(l) <= sparse_num_drop_max:
                for ii in l:
                    res_d[k].append(ii)
            else:
                res_d[k].append(0)

feature_expand_l = []
feature_l.pop(feature_l.index('uid'))
for k in feature_l:
    feature_expand_l += ['{}_{}'.format(k, i) for i in res_d[k]]
print(feature_expand_l)

dd = 5  # breakpoint anchor
import re, time

f = 'userFeature.data'

'''
(marriageStatus value inventory: same dump as in the probe above, omitted here)
'''


def fw_s(f, s):
    with open(f, 'w') as fw:
        fw.write(s)


# feature_d: the values each feature takes; a feature with more than one value
#            is a candidate for splitting into per-value sub-features
# feature_incomplete_rows_l: the per-row order of the first-level features
# Raw-data feature mining: per-feature values, value counts, and feature order
f_feature = 'toknowit.txt'
f_feature_ele_num = f_feature.replace('.', 'EleNum.')
f_feature_incomplete, f_feature_complete = f_feature.replace('.', 'Incomplete.'), f_feature.replace('.', 'Complete.')
# Turn the raw data into production data: promote sub-features of a first-level
# feature to first-level features, capping how many may be promoted per feature;
# the cap has to be chosen with the available compute and the algorithm in mind.
f_reduce, f_output = f_feature.replace('.', 'Reduce.'), f_feature.replace('.', 'Output.')
# <--- the files above take no parameters; their contents are fixed
# This file is the stage-one result and depends on parameters, so timestamp it
f_extend = f_feature.replace('.', 'Extend{}.'.format(int(time.time())))

to_write_immutable_file = True
# to_write_immutable_file = False
if to_write_immutable_file:
    feature_d, feature_incomplete_rows_l, reduce_chk_counter, reduce_chk_step = {}, [], 0, 300000
    # Scan the multi-GB file into two collections: the incomplete per-row
    # feature-name sequences, and the feature -> values map
    with open(f, 'r') as fr:
        for i in fr:
            l = i.replace('\n', '').split('|')
            feature_incomplete_rows_l_this = []
            for ii in l:
                ll = ii.split(' ')
                k = ll[0]
                feature_incomplete_rows_l_this.append(k)
                if k == 'uid':
                    continue
                if k not in feature_d:
                    feature_d[k] = []
                # order -->int
                for iii in ll[1:]:
                    feature_d[k].append(int(iii))
            feature_incomplete_rows_l.append(feature_incomplete_rows_l_this)
            reduce_chk_counter += 1
            print(reduce_chk_counter)
            if reduce_chk_counter % reduce_chk_step == 0:
                # reduce_chk_counter = 0  # should be reset to save memory; kept
                # running during testing to watch progress and the total row count
                for k in feature_d:
                    feature_d[k] = list(set(feature_d[k]))
                feature_incomplete_rows_l = [e for i, e in enumerate(feature_incomplete_rows_l)
                                             if feature_incomplete_rows_l.index(e) == i]
                # subset TEST
                break
    for k in feature_d:
        feature_d[k] = sorted(list(set(feature_d[k])), reverse=False)
    feature_incomplete_rows_l = [e for i, e in enumerate(feature_incomplete_rows_l)
                                 if feature_incomplete_rows_l.index(e) == i]
    s = '\n'.join([','.join(l) for l in feature_incomplete_rows_l])
    fw_s(f_feature_incomplete, s)

    # For every feature, collect the features ever seen after it; sorting by that
    # set's size recovers the global feature order from the incomplete rows
    feature_after_e_d = {}
    for l in feature_incomplete_rows_l:
        for e in l:
            if e not in feature_after_e_d:
                feature_after_e_d[e] = []
            feature_after_e_d[e] += l[l.index(e) + 1:]
            feature_after_e_d[e] = list(set(feature_after_e_d[e]))
    # the original first-level features
    feature_complete_l = [k for k in sorted(feature_after_e_d, key=lambda e: len(feature_after_e_d[e]), reverse=True)]
    print(feature_complete_l)
    s = '\n'.join(feature_complete_l)
    fw_s(f_feature_complete, s)
    print(feature_complete_l)

    feature_d_ = {}
    for feature in feature_complete_l:
        if feature == 'uid':
            continue
        feature_d_[feature] = feature_d[feature]
    del feature_d
    feature_d = feature_d_

    s = '\n'.join(['{}\n{}'.format(k, ','.join([str(i) for i in feature_d[k]])) for k in feature_d])
    fw_s(f_feature, s)
    s = '\n'.join(['{}\n{}'.format(k, len(feature_d[k])) for k in feature_d])
    fw_s(f_feature_ele_num, s)
    # raw data persisted <---

    # ---> process the raw data into new data
    # level-0 features: merge numbered first-level features such as kw1, kw2, ...
    # (relies on the digit being the trailing character, since endpos is the end
    # of the searched string, not of the match)
    feature_reduce_l = [i if re.search(r'\d', i) is None else i[0:re.search(r'\d', i).endpos - 1]
                        for i in feature_complete_l]
    # set() destroys the order
    print(feature_reduce_l)
    print(list(set(feature_reduce_l)))
    feature_reduce_l = [e for i, e in enumerate(feature_reduce_l) if feature_reduce_l.index(e) == i]
    print(feature_reduce_l)
    s = '\n'.join(feature_reduce_l)
    fw_s(f_reduce, s)

    relative_, absolute_ = 2 / 3, 50
    sparse_num_drop_max = min(
        [absolute_, max(sorted([len(feature_d[k]) for k in feature_d], reverse=False)[0:int(len(feature_d) * relative_)])])
    s = '\n'.join(
        ['{}\n{}'.format(k, ','.join([str(i) for i in feature_d[k][0:sparse_num_drop_max]])) for k in feature_d])
    fw_s(f_output, s)

    # attribute values per feature
    feature_extend_d = {}
    is_odd_line = True
    with open(f_output, 'r') as fr:
        for i in fr:
            l = i.replace('\n', '').split(',')
            if is_odd_line:
                is_odd_line = False
                k = l[0]
                feature_extend_d[k] = []
            else:
                is_odd_line = True
                if len(l) <= sparse_num_drop_max:
                    for ii in l:
                        feature_extend_d[k].append(ii)
                else:
                    feature_extend_d[k].append(0)

    feature_extend_l = []
    feature_complete_l.pop(feature_complete_l.index('uid'))
    feature_extend_l = '|'.join(
        ['|'.join(['{}_{}'.format(k, str(i)) for i in feature_extend_d[k]]) for k in feature_extend_d]).split('|')
    print(feature_extend_l)
    s = ','.join(feature_extend_l)
    fw_s(f_extend, s)

# Generate the reduced data set; test a Spark join with the ad features broadcast
# feature_extend_l, f_extend = [], 'toknowitExtend1526836898.txt'
#
# with open(f_extend, 'r') as fr:
#     feature_extend_l = [i.replace('\n', '') for i in fr][0].split(',')

d = 8  # breakpoint anchor
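The sparse_num_drop_max rule is easy to misread: it keeps the smaller of the absolute cap (50) and the largest distinct-value count among the smallest two-thirds of the features. A worked example with made-up counts:

# Worked example of the sparse_num_drop_max rule; counts are illustrative.
counts = [3, 3, 6, 8, 14, 122, 200, 235, 999]   # distinct values per feature
relative_, absolute_ = 2 / 3, 50

kept = sorted(counts)[0:int(len(counts) * relative_)]   # smallest 2/3 of features
print(kept)                                             # [3, 3, 6, 8, 14, 122]
print(min([absolute_, max(kept)]))                      # min(50, 122) -> 50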
age_0,age_1,age_2,age_3,age_4,age_5,gender_0,gender_1,gender_2,marriageStatus_0,marriageStatus_2,marriageStatus_3,marriageStatus_5,marriageStatus_6,marriageStatus_9,marriageStatus_10,marriageStatus_11,marriageStatus_12,marriageStatus_13,marriageStatus_14,marriageStatus_15,education_0,education_1,education_2,education_3,education_4,education_5,education_6,education_7,consumptionAbility_0,consumptionAbility_1,consumptionAbility_2,LBS_0,LBS_1,LBS_4,LBS_6,LBS_7,LBS_8,LBS_9,LBS_13,LBS_14,LBS_15,LBS_16,LBS_18,LBS_19,LBS_21,LBS_25,LBS_27,LBS_29,LBS_32,LBS_33,LBS_35,LBS_38,LBS_39,LBS_41,LBS_43,LBS_45,LBS_46,LBS_47,LBS_48,LBS_49,LBS_52,LBS_54,LBS_56,LBS_57,LBS_61,LBS_62,LBS_63,LBS_64,LBS_66,LBS_69,LBS_71,LBS_72,LBS_73,LBS_75,LBS_77,LBS_78,LBS_81,LBS_83,LBS_84,LBS_85,LBS_86,interest1_1,interest1_2,interest1_3,interest1_4,interest1_5,interest1_6,interest1_7,interest1_8,interest1_9,interest1_10,interest1_11,interest1_12,interest1_13,interest1_14,interest1_15,interest1_16,interest1_17,interest1_18,interest1_19,interest1_20,interest1_21,interest1_22,interest1_23,interest1_24,interest1_25,interest1_26,interest1_27,interest1_28,interest1_29,interest1_30,interest1_31,interest1_32,interest1_33,interest1_34,interest1_35,interest1_36,interest1_37,interest1_38,interest1_39,interest1_40,interest1_41,interest1_42,interest1_43,interest1_44,interest1_45,interest1_46,interest1_47,interest1_48,interest1_49,interest1_50,interest2_1,interest2_2,interest2_3,interest2_4,interest2_5,interest2_6,interest2_7,interest2_8,interest2_9,interest2_10,interest2_11,interest2_12,interest2_13,interest2_14,interest2_15,interest2_16,interest2_17,interest2_18,interest2_19,interest2_20,interest2_21,interest2_22,interest2_23,interest2_24,interest2_25,interest2_26,interest2_27,interest2_28,interest2_29,interest2_30,interest2_31,interest2_32,interest2_33,interest2_35,interest2_36,interest2_37,interest2_38,interest2_39,interest2_40,interest2_41,interest2_42,interest2_43,interest2_44,interest2_45,interest2_46,interest2_47,interest2_48,interest2_49,interest2_50,interest2_51,interest3_1,interest3_2,interest3_3,interest3_4,interest3_5,interest3_6,interest3_7,interest3_8,interest3_9,interest3_10,interest4_1,interest4_2,interest4_3,interest4_4,interest4_5,interest4_6,interest4_7,interest4_8,interest4_9,interest4_10,interest5_1,interest5_2,interest5_3,interest5_4,interest5_5,interest5_6,interest5_7,interest5_8,interest5_9,interest5_10,interest5_11,interest5_12,interest5_13,interest5_14,interest5_15,interest5_16,interest5_17,interest5_18,interest5_19,interest5_20,interest5_21,interest5_22,interest5_23,interest5_24,interest5_25,interest5_26,interest5_27,interest5_28,interest5_29,interest5_30,interest5_31,interest5_32,interest5_33,interest5_34,interest5_35,interest5_36,interest5_37,interest5_38,interest5_39,interest5_40,interest5_41,interest5_42,interest5_43,interest5_44,interest5_45,interest5_46,interest5_47,interest5_48,interest5_49,interest5_50,kw1_13,kw1_19,kw1_28,kw1_69,kw1_70,kw1_72,kw1_87,kw1_92,kw1_105,kw1_106,kw1_109,kw1_119,kw1_121,kw1_123,kw1_133,kw1_136,kw1_145,kw1_152,kw1_157,kw1_163,kw1_169,kw1_176,kw1_177,kw1_180,kw1_181,kw1_191,kw1_209,kw1_235,kw1_242,kw1_249,kw1_278,kw1_279,kw1_289,kw1_295,kw1_313,kw1_317,kw1_321,kw1_336,kw1_341,kw1_344,kw1_354,kw1_358,kw1_366,kw1_367,kw1_370,kw1_376,kw1_378,kw1_380,kw1_382,kw1_391,kw2_2,kw2_10,kw2_11,kw2_34,kw2_46,kw2_47,kw2_50,kw2_55,kw2_62,kw2_63,kw2_69,kw2_70,kw2_76,kw2_87,kw2_91,kw2_100,kw2_114,kw2_116,kw2_117,kw2_123,kw2_124,kw2_127,kw2_129,kw2_135,kw2_137,kw2_142,kw2_144,kw2_151,kw2_158,kw2_163
,kw2_168,kw2_174,kw2_177,kw2_180,kw2_184,kw2_192,kw2_196,kw2_197,kw2_200,kw2_202,kw2_215,kw2_216,kw2_217,kw2_223,kw2_235,kw2_237,kw2_240,kw2_241,kw2_246,kw2_250,kw3_7,kw3_27,kw3_29,kw3_68,kw3_80,kw3_88,kw3_95,kw3_101,kw3_138,kw3_171,kw3_186,kw3_197,kw3_198,kw3_206,kw3_213,kw3_223,kw3_248,kw3_263,kw3_273,kw3_302,kw3_316,kw3_336,kw3_349,kw3_362,kw3_381,kw3_401,kw3_412,kw3_416,kw3_453,kw3_465,kw3_470,kw3_488,kw3_513,kw3_534,kw3_549,kw3_560,kw3_570,kw3_581,kw3_586,kw3_598,kw3_610,kw3_627,kw3_633,kw3_638,kw3_668,kw3_685,kw3_692,kw3_694,kw3_695,kw3_701,topic1_0,topic1_1,topic1_2,topic1_3,topic1_4,topic1_5,topic1_6,topic1_7,topic1_9,topic1_10,topic1_11,topic1_12,topic1_13,topic1_14,topic1_15,topic1_16,topic1_17,topic1_18,topic1_19,topic1_20,topic1_21,topic1_22,topic1_23,topic1_24,topic1_25,topic1_26,topic1_27,topic1_28,topic1_29,topic1_30,topic1_31,topic1_32,topic1_33,topic1_34,topic1_35,topic1_36,topic1_37,topic1_38,topic1_39,topic1_40,topic1_41,topic1_42,topic1_43,topic1_44,topic1_45,topic1_46,topic1_47,topic1_48,topic1_49,topic1_50,topic2_0,topic2_2,topic2_3,topic2_4,topic2_5,topic2_6,topic2_7,topic2_9,topic2_10,topic2_11,topic2_13,topic2_14,topic2_15,topic2_16,topic2_17,topic2_19,topic2_20,topic2_21,topic2_22,topic2_24,topic2_25,topic2_26,topic2_27,topic2_28,topic2_29,topic2_30,topic2_31,topic2_32,topic2_33,topic2_34,topic2_35,topic2_36,topic2_39,topic2_40,topic2_41,topic2_42,topic2_43,topic2_44,topic2_45,topic2_46,topic2_47,topic2_48,topic2_49,topic2_50,topic2_51,topic2_52,topic2_53,topic2_54,topic2_55,topic2_56,topic3_3,topic3_10,topic3_11,topic3_14,topic3_18,topic3_24,topic3_28,topic3_30,topic3_31,topic3_33,topic3_39,topic3_42,topic3_43,topic3_47,topic3_53,topic3_55,topic3_56,topic3_58,topic3_59,topic3_60,topic3_62,topic3_66,topic3_68,topic3_70,topic3_72,topic3_76,topic3_78,topic3_79,topic3_81,topic3_84,topic3_87,topic3_90,topic3_92,topic3_99,topic3_100,topic3_101,topic3_109,topic3_111,topic3_112,topic3_119,topic3_121,topic3_123,topic3_124,topic3_127,topic3_130,topic3_136,topic3_137,topic3_138,topic3_139,topic3_141,appIdInstall_1,appIdInstall_4,appIdInstall_6,appIdInstall_9,appIdInstall_10,appIdInstall_11,appIdInstall_12,appIdInstall_15,appIdInstall_16,appIdInstall_17,appIdInstall_19,appIdInstall_21,appIdInstall_23,appIdInstall_26,appIdInstall_27,appIdInstall_28,appIdInstall_29,appIdInstall_32,appIdInstall_34,appIdInstall_35,appIdInstall_39,appIdInstall_40,appIdInstall_41,appIdInstall_42,appIdInstall_43,appIdInstall_44,appIdInstall_45,appIdInstall_47,appIdInstall_48,appIdInstall_49,appIdInstall_51,appIdInstall_52,appIdInstall_55,appIdInstall_56,appIdInstall_57,appIdInstall_58,appIdInstall_60,appIdInstall_61,appIdInstall_62,appIdInstall_63,appIdInstall_65,appIdInstall_67,appIdInstall_68,appIdInstall_69,appIdInstall_70,appIdInstall_71,appIdInstall_73,appIdInstall_74,appIdInstall_76,appIdInstall_77,appIdAction_2,appIdAction_4,appIdAction_5,appIdAction_7,appIdAction_8,appIdAction_11,appIdAction_13,appIdAction_14,appIdAction_16,appIdAction_17,appIdAction_27,appIdAction_30,appIdAction_32,appIdAction_33,appIdAction_34,appIdAction_35,appIdAction_36,appIdAction_37,appIdAction_38,appIdAction_39,appIdAction_40,appIdAction_41,appIdAction_43,appIdAction_44,appIdAction_45,appIdAction_47,appIdAction_50,appIdAction_51,appIdAction_52,appIdAction_53,appIdAction_55,appIdAction_56,appIdAction_60,appIdAction_62,appIdAction_65,appIdAction_66,appIdAction_69,appIdAction_70,appIdAction_71,appIdAction_72,appIdAction_74,appIdAction_75,appIdAction_76,appIdAction_77,appIdAction_80,appIdAction_81,appIdAction_83,
appIdAction_84,appIdAction_85,appIdAction_91,ct_0,ct_1,ct_2,ct_3,ct_4,os_0,os_1,os_2,carrier_0,carrier_1,carrier_2,carrier_3,house_1
Moving the JOIN operation to Spark
import copy, re, time

f = 'userFeature.data'

'''
(marriageStatus value inventory: same dump as in the probe above, omitted here)
'''


def fw_s(f, s):
    with open(f, 'w') as fw:
        fw.write(s)


# feature_d: the values each feature takes; a feature with more than one value
#            is a candidate for splitting into per-value sub-features
# feature_incomplete_rows_l: the per-row order of the first-level features
# Raw-data feature mining: per-feature values, value counts, and feature order
f_feature = 'toknowit.txt'
f_feature_ele_num = f_feature.replace('.', 'EleNum.')
f_feature_incomplete, f_feature_complete = f_feature.replace('.', 'Incomplete.'), f_feature.replace('.', 'Complete.')
# Turn the raw data into production data: promote sub-features of a first-level
# feature to first-level features, capping how many may be promoted per feature;
# the cap has to be chosen with the available compute and the algorithm in mind.
f_reduce, f_output = f_feature.replace('.', 'Reduce.'), f_feature.replace('.', 'Output.')
# <--- the files above take no parameters; their contents are fixed
# This file is the stage-one result and depends on parameters, so timestamp it
f_extend = f_feature.replace('.', 'Extend{}.'.format(int(time.time())))

to_write_immutable_file = True
# to_write_immutable_file = False
if to_write_immutable_file:
    feature_d, feature_incomplete_rows_l, reduce_chk_counter, reduce_chk_step = {}, [], 0, 300000
    # Scan the multi-GB file into two collections: the incomplete per-row
    # feature-name sequences, and the feature -> values map
    with open(f, 'r') as fr:
        for i in fr:
            l = i.replace('\n', '').split('|')
            feature_incomplete_rows_l_this = []
            for ii in l:
                ll = ii.split(' ')
                k = ll[0]
                feature_incomplete_rows_l_this.append(k)
                if k == 'uid':
                    continue
                if k not in feature_d:
                    feature_d[k] = []
                # order -->int
                for iii in ll[1:]:
                    feature_d[k].append(int(iii))
            feature_incomplete_rows_l.append(feature_incomplete_rows_l_this)
            reduce_chk_counter += 1
            print(reduce_chk_counter)
            if reduce_chk_counter % reduce_chk_step == 0:
                # reduce_chk_counter = 0  # should be reset to save memory; kept
                # running during testing to watch progress and the total row count
                for k in feature_d:
                    feature_d[k] = list(set(feature_d[k]))
                feature_incomplete_rows_l = [e for i, e in enumerate(feature_incomplete_rows_l)
                                             if feature_incomplete_rows_l.index(e) == i]
                # subset TEST
                # break
    for k in feature_d:
        feature_d[k] = sorted(list(set(feature_d[k])), reverse=False)
    feature_incomplete_rows_l = [e for i, e in enumerate(feature_incomplete_rows_l)
                                 if feature_incomplete_rows_l.index(e) == i]
    s = '\n'.join([','.join(l) for l in feature_incomplete_rows_l])
    fw_s(f_feature_incomplete, s)

    feature_after_e_d = {}
    for l in feature_incomplete_rows_l:
        for e in l:
            if e not in feature_after_e_d:
                feature_after_e_d[e] = []
            feature_after_e_d[e] += l[l.index(e) + 1:]
            feature_after_e_d[e] = list(set(feature_after_e_d[e]))
    # the original first-level features
    feature_complete_l = [k for k in sorted(feature_after_e_d, key=lambda e: len(feature_after_e_d[e]), reverse=True)]
    print(feature_complete_l)
    s = '\n'.join(feature_complete_l)
    fw_s(f_feature_complete, s)
    print(feature_complete_l)

    feature_d_ = {}
    for feature in feature_complete_l:
        if feature == 'uid':
            continue
        feature_d_[feature] = feature_d[feature]
    del feature_d
    feature_d = feature_d_

    s = '\n'.join(['{}\n{}'.format(k, ','.join([str(i) for i in feature_d[k]])) for k in feature_d])
    fw_s(f_feature, s)
    s = '\n'.join(['{}\n{}'.format(k, len(feature_d[k])) for k in feature_d])
    fw_s(f_feature_ele_num, s)
    # raw data persisted <---

    # ---> process the raw data into new data
    # level-0 features: merge numbered first-level features such as kw1, kw2, ...
    feature_reduce_l = [i if re.search(r'\d', i) is None else i[0:re.search(r'\d', i).endpos - 1]
                        for i in feature_complete_l]
    # set() destroys the order
    print(feature_reduce_l)
    print(list(set(feature_reduce_l)))
    feature_reduce_l = [e for i, e in enumerate(feature_reduce_l) if feature_reduce_l.index(e) == i]
    print(feature_reduce_l)
    s = '\n'.join(feature_reduce_l)
    fw_s(f_reduce, s)

    relative_, absolute_ = 2 / 3, 50
    sparse_num_drop_max = min(
        [absolute_, max(sorted([len(feature_d[k]) for k in feature_d], reverse=False)[0:int(len(feature_d) * relative_)])])
    s = '\n'.join(
        ['{}\n{}'.format(k, ','.join([str(i) for i in feature_d[k][0:sparse_num_drop_max]])) for k in feature_d])
    fw_s(f_output, s)

    # attribute values per feature
    feature_extend_d = {}
    is_odd_line = True
    with open(f_output, 'r') as fr:
        for i in fr:
            l = i.replace('\n', '').split(',')
            if is_odd_line:
                is_odd_line = False
                k = l[0]
                feature_extend_d[k] = []
            else:
                is_odd_line = True
                if len(l) <= sparse_num_drop_max:
                    for ii in l:
                        feature_extend_d[k].append(ii)
                else:
                    feature_extend_d[k].append(0)

    feature_extend_l = []
    feature_complete_l.pop(feature_complete_l.index('uid'))
    feature_extend_l = '|'.join(
        ['|'.join(['{}_{}'.format(k, str(i)) for i in feature_extend_d[k]]) for k in feature_extend_d]).split('|')
    print(feature_extend_l)
    s = ','.join(feature_extend_l)
    fw_s(f_extend, s)

# Generate the reduced data set; test a Spark join with the ad features broadcast
ori_l, extend_l = [], []
with open('toknowitComplete.txt', 'r') as fr:
    ori_l = [i.replace('\n', '') for i in fr]
feature_extend_l, f_feature_extend = [], f_extend
with open(f_feature_extend, 'r') as fr:
    extend_l = [i.replace('\n', '').split(',') for i in fr][0]

# Group the expanded column names under their original feature by substring match
ori_extend_d = {}
for ori in ori_l:
    for extend_ in extend_l:
        if ori in extend_:
            if ori not in ori_extend_d:
                ori_extend_d[ori] = {}
            ori_extend_d[ori][extend_] = 0
# Substring matching misfires: 'marriageStatus' contains 'age', so the
# marriageStatus_* columns also land under 'age'; remove them again.
# (Other collisions, e.g. 'ct' inside 'appIdAction', would need the same care.)
ori_extend_d_ = copy.deepcopy(ori_extend_d)
for i in ori_extend_d_['age']:
    if 'marriageStatus' in i:
        del ori_extend_d['age'][i]
del ori_extend_d_

'''
1 - build the data meta-structure with every leaf value set to 0
2 - take each data row and update the leaf values
'''
c_ = 0
rows_d_l = []
with open(f, 'r') as fr:
    for i in fr:
        # c_ += 1
        # if c_ == 6:
        #     break
        ori_row_l = i.replace('\n', '').split('|')
        ori_extend_d_this = copy.deepcopy(ori_extend_d)
        uid_d = {}
        for ii in ori_row_l:
            l = ii.split(' ')
            print(l)
            feature_ori, val_l = l[0], l[1:]
            if feature_ori == 'uid':
                uid = val_l[0]
                continue
            if len(ori_extend_d[feature_ori]) == 1:
                # feature kept whole: a single placeholder leaf
                for feature_sub in ori_extend_d_this[feature_ori]:
                    print(feature_sub)
                    ori_extend_d_this[feature_ori][feature_sub] = 1 if int(val_l[0]) > 0 else 0
            else:
                for val_ in val_l:
                    feature_sub = '{}_{}'.format(feature_ori, val_)
                    print(feature_sub)
                    if feature_sub in ori_extend_d_this[feature_ori]:  ### redundant check
                        ori_extend_d_this[feature_ori][feature_sub] = 1  # ????
        uid_d[uid] = ori_extend_d_this
        del ori_extend_d_this
        rows_d_l.append(uid_d)
        del uid_d

s_l = []
f_userdata_extend = f.replace('.data', '{}.data'.format(int(time.time())))
for d in rows_d_l:
    for uid in d:
        c_ += 1
        l = []
        d_ = d[uid]
        for feature_ in d_:
            for feature_sub in d_[feature_]:
                l.append(d_[feature_][feature_sub])
        s = '{},{}'.format(uid, ','.join([str(i) for i in l]))
        s_l.append(s)
fw_s(f_userdata_extend, '\n'.join(s_l))
print(c_)

'''
gen JOIN data FOR DNN
'''
f_user = 'userFeature.data'
f_ad = 'adFeature.csv'
f_user_extend = f_userdata_extend
f_train = 'train.csv'
f_test = 'test2.csv'

'''
gen head
'''
csv_head = 'advertiserId,campaignId,creativeId,creativeSize,adCategoryId,productId,productType'
f_toknowitExtend = 'toknowitExtend1527038949.txt'
try:
    with open(f_toknowitExtend, 'r') as fr:
        for i in fr:
            csv_head = 'label,{},{}'.format(i.replace('\n', ''), csv_head)
    print(csv_head)
except Exception as e:
    print(e)
    csv_head = ''  # no file

'''
get dict
'''
ad_d = {}
with open(f_ad, 'r') as fr:
    for i in fr:
        if 'aid' in i:  # skip the header row
            continue
        l = i.replace('\n', '').split(',')
        aid = l[0]
        ad_d[aid] = ','.join(l[1:])
uid_d = {}
with open(f_user_extend, 'r') as fr:
    for i in fr:
        if 'aid' in i:  # skip a header row, if any
            continue
        l = i.replace('\n', '').split(',')
        uid = l[0]
        uid_d[uid] = ','.join(l[1:])

'''
gen train data
'''
dnn_csvTRAIN = 'dnn_csvTRAIN{}.csv'.format(int(time.time()))
with open(dnn_csvTRAIN, 'w') as fa:
    fa.write(csv_head + '\n')  # newline so the first data row starts on its own line
with open(f_train, 'r') as fr:
    for i in fr:
        if 'aid' in i:
            continue
        try:
            l = i.replace('\n', '').replace(' ', '').split(',')
            print(l)
            aid, uid, label = l
            s = '{},{},{}\n'.format(label, uid_d[uid], ad_d[aid])
            with open(dnn_csvTRAIN, 'a') as fa:
                fa.write(s)
        except Exception as e:
            print(e)

'''
gen test data
'''
dnn_csvTEST = 'dnn_csvTEST{}.csv'.format(int(time.time()))
with open(dnn_csvTEST, 'w') as fa:
    fa.write(csv_head + '\n')
with open(f_test, 'r') as fr:
    for i in fr:
        if 'aid' in i:
            continue
        try:
            break  # toggle left active: remove to actually generate the test rows
            l = i.replace('\n', '').replace(' ', '').split(',')
            print(l)
            # aid, uid, label = l
            aid, uid = l
            label = 0
            s = '{},{},{}\n'.format(label, uid_d[uid], ad_d[aid])
            with open(dnn_csvTEST, 'a') as fa:
                fa.write(s)
        except Exception as e:
            print(e)

dd = 9  # breakpoint anchor
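The heading above says the JOIN should move to Spark, while the script still joins through in-memory dicts. Below is a hedged PySpark sketch of the intended broadcast join; 'userFeatureExtend.data' is a placeholder for the timestamped extended user file, and column selection and ordering are left out.

# Sketch: the JOIN step on Spark, broadcasting the small ad-feature table.
# Paths are placeholders; this is an outline of the intent, not tested code.
from pyspark.sql import SparkSession
from pyspark.sql.functions import broadcast

spark = SparkSession.builder.appName('gen_dnn_data').getOrCreate()

ad = spark.read.csv('adFeature.csv', header=True)              # small table
user = spark.read.csv('userFeatureExtend.data', header=False)  # uid + 0/1 columns
user = user.withColumnRenamed('_c0', 'uid')
train = spark.read.csv('train.csv', header=True)               # aid, uid, label

joined = (train.join(user, on='uid', how='left')
               .join(broadcast(ad), on='aid', how='left'))     # broadcast the ad side
joined.write.csv('dnn_csvTRAIN_spark', header=True)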