特征列 属性值 获取 vowpal wabbit 生成DNN 的训练测试数据
用户特征文件 userFeature.data:每行代表一个用户的特征数据,格式为:“uid|features”,uid 和 features 用竖线“|”分隔。其中 features 采用 vowpal wabbit(https://github.com/JohnLangford/vowpal_wabbit)格式:“feature_group1|feature_group2|feature_group3|...”。每个 feature_group 代表一个特征组,多个特征组之间也以竖线“|”分隔。一个特征组若包括多个值,则以空格分隔,格式为:“feature_group_name fea_name1 fea_name2 …”,其中 fea_name 采用数据编号的格式。
特征列 属性值 获取
# Collect the distinct integer values observed under each feature group, then
# print each feature key followed by its sorted value list.
# NOTE(review): `f` (the input path, 'userFeature.data') is expected to be
# defined earlier in the file — confirm against the surrounding script.
d = {}
with open(f, 'r') as fr:
    for line in fr:
        for group in line.split('|'):
            fields = group.split(' ')
            key = fields[0]
            # set gives O(1) dedup; the original did an O(n) list-membership
            # test per value and converted each value to int twice.
            values = d.setdefault(key, set())
            for raw in fields[1:]:
                values.add(int(raw))
for key in d:
    ordered = sorted(d[key], reverse=False)
    print(key)
    print(ordered)
批处理 减小运行时间
# Batched variant of the feature-value scan: accumulate raw values per feature
# and deduplicate every `reduce_chk_step` processed lines to bound memory
# growth, then persist the final per-feature sorted value lists.
# NOTE(review): `f` (the input path) is expected to be defined earlier in the file.
d, reduce_chk_counter, reduce_chk_step = {}, 0, 500000
with open(f, 'r') as fr:
    for line in fr:
        for group in line.split('|'):
            fields = group.split(' ')
            key = fields[0]
            if key not in d:
                d[key] = []
            for raw in fields[1:]:
                d[key].append(int(raw))
        reduce_chk_counter += 1
        if reduce_chk_counter == reduce_chk_step:
            reduce_chk_counter = 0
            # periodic reduce: dedupe the accumulated values to free memory
            for key in d:
                d[key] = list(set(d[key]))
                ordered = sorted(d[key], reverse=False)
                print(key)
                print(ordered)
# final reduce after the whole file is consumed
for key in d:
    d[key] = list(set(d[key]))
    ordered = sorted(d[key], reverse=False)
    print(key)
    print(ordered)
res_f = 'toknowit.txt'
with open(res_f, 'w') as fw:
    for key in d:
        ordered = sorted(d[key], reverse=False)
        print(key)
        print(ordered)
        # BUG FIX: the values are ints, so ','.join(ordered) raised TypeError;
        # stringify each value first (as the later revision of this script does).
        s = '{}\n{}\n'.format(key, ','.join(str(v) for v in ordered))
        fw.write(s)
将数据冗余至内存,批处理,去重,释放内存
f = 'userFeature.data'

# Probe the value set of the marriageStatus feature column.
# NOTE(review): the bare `break` statements below were reconstructed from a
# collapsed paste; they read as debug toggles that skip the loop bodies
# entirely (the full scans having already been run once) — confirm intent.
d, d1 = {}, {}
with open(f, 'r') as fr:
    for line in fr:
        break  # toggle: marriageStatus probe disabled
        groups = line.split('|')
        for group in groups:
            if 'marriageStatus' in group:
                key = len(group)
                d[key] = group
                key = group.split('marriageStatus')[-1]
                d1[key] = group
for key in d:
    print(key, d[key])
for key in d1:
    print(key, d1[key])
'''
17 marriageStatus 11
19 marriageStatus 2 13
20 marriageStatus 13 10
16 marriageStatus 0
21 marriageStatus 2 13 9
22 marriageStatus 12 13 9
23 marriageStatus 12 13 10
11 marriageStatus 11
5 13 marriageStatus 5 13
13 10 marriageStatus 13 10
10 marriageStatus 10
15 marriageStatus 15
0 marriageStatus 0
13 15 marriageStatus 13 15
12 13 marriageStatus 12 13
13 marriageStatus 13
6 13 marriageStatus 6 13
2 13 marriageStatus 2 13
13 9 marriageStatus 13 9
6 13 9 marriageStatus 6 13 9
2 13 9 marriageStatus 2 13 9
5 13 9 marriageStatus 5 13 9
12 13 9 marriageStatus 12 13 9
14 marriageStatus 14
12 13 10 marriageStatus 12 13 10
3 marriageStatus 3
15 10 marriageStatus 15 10
8 marriageStatus 8
6 13 10 marriageStatus 6 13 10
5 13 10 marriageStatus 5 13 10
13 10 9 marriageStatus 13 10 9
13 15 10 marriageStatus 13 15 10
2 13 10 marriageStatus 2 13 10

marriageStatus 0 2 5 6 8 9 10 11 12 13 15
'''
# Naive (unbatched) scan of every feature's values.
d = {}
with open(f, 'r') as fr:
    for line in fr:
        break  # toggle: naive scan disabled
        groups = line.split('|')
        for group in groups:
            fields = group.split(' ')
            key = fields[0]
            if key not in d:
                d[key] = []
            for raw in fields[1:]:
                value = int(raw)
                if int(raw) not in d[key]:
                    d[key].append(value)

# Batched scan (skips 'uid'), with a periodic dedup-reduce step.
d, reduce_chk_counter, reduce_chk_step = {}, 0, 500000
with open(f, 'r') as fr:
    for line in fr:
        break  # toggle: batched scan disabled
        groups = line.split('|')
        for group in groups:
            fields = group.split(' ')
            key = fields[0]
            if key == 'uid':
                continue
            if key not in d:
                d[key] = []
            for raw in fields[1:]:
                value = int(raw)
                d[key].append(value)
        reduce_chk_counter += 1
        if reduce_chk_counter == reduce_chk_step:
            reduce_chk_counter = 0
            for key in d:
                d[key] = list(set(d[key]))
                ordered = sorted(d[key], reverse=False)
                print(key)
                print(ordered)
for key in d:
    break  # toggle: final report disabled
    d[key] = list(set(d[key]))
    ordered = sorted(d[key], reverse=False)
    print(key)
    print(ordered)

res_f = 'toknowit.txt'
# with open(res_f, 'w') as fw:
#     for k in d:
#         l = sorted(d[k], reverse=False)
#         print(k)
#         print(l)
#         s = '{}\n{}\n'.format(k, ','.join([str(i) for i in l]))
#         fw.write(s)

# Truncate each persisted feature line to its first 200 comma-separated entries.
cut_l = []
with open(res_f, 'r') as fr:
    for line in fr:
        cut_l.append(line.replace('\n', '').split(',')[0:200])
res_f_cut = 'toknowitCUT.txt'
with open(res_f_cut, 'w') as fw:
    for row in cut_l:
        fw.write('{}\n'.format(','.join([str(v) for v in row])))
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 | age 0 , 1 , 2 , 3 , 4 , 5 gender 0 , 1 , 2 marriageStatus 0 , 2 , 3 , 5 , 6 , 8 , 9 , 10 , 11 , 12 , 13 , 14 , 15 education 0 , 1 , 2 , 3 , 4 , 5 , 6 , 7 consumptionAbility 0 , 1 , 2 LBS 0 , 1 , 2 , 3 , 4 , 6 , 7 , 8 , 9 , 10 , 11 , 12 , 13 , 14 , 15 , 16 , 18 , 19 , 20 , 21 , 23 , 25 , 26 , 27 , 29 , 30 , 31 , 32 , 33 , 35 , 36 , 38 , 39 , 40 , 41 , 42 , 43 , 45 , 46 , 47 , 48 , 49 , 50 , 51 , 52 , 53 , 54 , 55 , 56 , 57 , 60 , 61 , 62 , 63 , 64 , 65 , 66 , 67 , 68 , 69 , 70 , 71 , 72 , 73 , 75 , 76 , 77 , 78 , 79 , 80 , 81 , 82 , 83 , 84 , 85 , 86 , 87 , 88 , 89 , 91 , 92 , 94 , 95 , 97 , 98 , 99 , 100 , 101 , 102 , 103 , 104 , 105 , 106 , 107 , 108 , 110 , 111 , 112 , 113 , 114 , 115 , 116 , 117 , 118 , 119 , 120 , 121 , 122 , 123 , 124 , 125 , 126 , 127 , 128 , 129 , 130 , 131 , 132 , 133 , 135 , 137 , 138 , 139 , 142 , 143 , 144 , 145 , 146 , 147 , 149 , 150 , 151 , 152 , 153 , 154 , 155 , 157 , 158 , 159 , 160 , 161 , 163 , 165 , 168 , 170 , 171 , 172 , 173 , 174 , 175 , 176 , 177 , 178 , 179 , 180 , 181 , 183 , 184 , 185 , 186 , 188 , 189 , 190 , 191 , 192 , 193 , 194 , 195 , 197 , 198 , 199 , 200 , 201 , 202 , 203 , 204 , 206 , 208 , 209 , 210 , 211 , 212 , 214 , 215 , 216 , 217 , 218 , 219 , 220 , 222 , 223 , 224 , 225 , 227 , 229 , 232 , 233 , 234 , 235 , 236 interest1 1 , 2 , 3 , 4 , 5 , 6 , 7 , 8 , 9 , 10 , 11 , 12 , 13 , 14 , 15 , 16 , 17 , 18 , 19 , 20 , 21 , 22 , 23 , 24 , 25 , 26 , 27 , 28 , 29 , 30 , 31 , 32 , 33 , 34 , 35 , 36 , 37 , 38 , 39 , 40 , 41 , 42 , 43 , 44 , 45 , 46 , 47 , 48 , 49 , 50 , 51 , 52 , 53 , 54 , 55 , 56 , 57 , 58 , 59 , 60 , 61 , 62 , 63 , 64 , 65 , 66 , 67 , 68 , 69 , 70 , 71 , 72 , 73 , 74 , 75 , 76 , 77 , 78 , 79 , 80 , 81 , 82 , 83 , 84 , 85 , 86 , 87 , 88 , 89 , 90 , 91 , 92 , 93 , 94 , 95 , 96 , 97 , 98 , 99 , 100 , 101 , 102 , 103 , 104 , 105 , 106 , 107 , 108 , 109 , 110 , 111 , 112 , 113 
, 114 , 115 , 116 , 117 , 118 , 119 , 120 , 121 , 122 interest2 1 , 2 , 3 , 4 , 5 , 6 , 7 , 8 , 9 , 10 , 11 , 12 , 13 , 14 , 15 , 16 , 17 , 18 , 19 , 20 , 21 , 22 , 23 , 24 , 25 , 26 , 27 , 28 , 29 , 30 , 31 , 32 , 33 , 35 , 36 , 37 , 38 , 39 , 40 , 41 , 42 , 43 , 44 , 45 , 46 , 47 , 48 , 49 , 50 , 51 , 52 , 53 , 54 , 55 , 56 , 57 , 58 , 59 , 60 , 61 , 62 , 63 , 64 , 65 , 66 , 67 , 68 , 69 , 70 , 71 , 72 , 73 , 74 , 75 , 76 , 78 , 79 , 80 , 81 , 82 interest5 1 , 2 , 3 , 4 , 5 , 6 , 7 , 8 , 9 , 10 , 11 , 12 , 13 , 14 , 15 , 16 , 17 , 18 , 19 , 20 , 21 , 22 , 23 , 24 , 25 , 26 , 27 , 28 , 29 , 30 , 31 , 32 , 33 , 34 , 35 , 36 , 37 , 38 , 39 , 40 , 41 , 42 , 43 , 44 , 45 , 46 , 47 , 48 , 49 , 50 , 51 , 52 , 53 , 54 , 55 , 56 , 57 , 58 , 59 , 60 , 61 , 62 , 63 , 64 , 65 , 66 , 67 , 68 , 69 , 70 , 71 , 72 , 73 , 74 , 75 , 76 , 77 , 78 , 79 , 80 , 81 , 82 , 83 , 84 , 85 , 86 , 87 , 88 , 89 , 90 , 91 , 92 , 93 , 94 , 95 , 96 , 97 , 98 , 99 , 100 , 101 , 102 , 103 , 104 , 105 , 106 , 107 , 108 , 109 , 110 , 111 , 112 , 113 , 114 , 115 , 116 , 117 , 118 , 119 , 120 , 121 , 122 , 123 , 124 , 125 , 126 , 127 , 128 , 129 , 130 , 131 , 132 , 133 , 134 , 135 , 136 kw1 2 , 3 , 8 , 13 , 17 , 19 , 21 , 28 , 29 , 39 , 41 , 42 , 43 , 46 , 56 , 59 , 65 , 68 , 69 , 70 , 71 , 72 , 74 , 86 , 87 , 88 , 90 , 92 , 95 , 100 , 101 , 105 , 106 , 109 , 111 , 112 , 113 , 119 , 121 , 123 , 125 , 131 , 133 , 136 , 139 , 141 , 142 , 143 , 145 , 150 , 152 , 156 , 157 , 162 , 163 , 166 , 169 , 172 , 173 , 174 , 176 , 177 , 180 , 181 , 183 , 184 , 185 , 186 , 191 , 199 , 203 , 204 , 209 , 211 , 214 , 216 , 230 , 235 , 240 , 242 , 243 , 246 , 249 , 260 , 263 , 265 , 268 , 269 , 271 , 272 , 278 , 279 , 283 , 284 , 289 , 291 , 292 , 295 , 302 , 303 , 304 , 307 , 313 , 317 , 321 , 322 , 323 , 331 , 336 , 341 , 343 , 344 , 351 , 354 , 357 , 358 , 359 , 366 , 367 , 369 , 370 , 372 , 373 , 375 , 376 , 377 , 378 , 380 , 381 , 382 , 390 , 391 , 393 , 396 , 401 , 402 , 406 , 407 , 408 , 409 , 411 , 414 , 417 , 
423 , 429 , 433 , 434 , 437 , 438 , 441 , 442 , 443 , 449 , 456 , 464 , 465 , 468 , 472 , 473 , 475 , 477 , 478 , 480 , 482 , 485 , 486 , 487 , 495 , 496 , 497 , 504 , 506 , 507 , 511 , 513 , 521 , 522 , 526 , 532 , 536 , 541 , 542 , 546 , 547 , 560 , 561 , 563 , 566 , 567 , 575 , 576 , 578 , 581 , 584 , 588 , 592 , 594 , 604 , 605 , 610 kw2 2 , 6 , 7 , 9 , 10 , 11 , 12 , 14 , 21 , 22 , 23 , 25 , 26 , 30 , 34 , 38 , 40 , 41 , 42 , 43 , 44 , 46 , 47 , 50 , 55 , 56 , 62 , 63 , 66 , 69 , 70 , 71 , 72 , 74 , 75 , 76 , 77 , 78 , 80 , 81 , 84 , 85 , 87 , 89 , 90 , 91 , 94 , 95 , 100 , 112 , 114 , 116 , 117 , 118 , 119 , 121 , 123 , 124 , 127 , 128 , 129 , 130 , 133 , 135 , 137 , 142 , 143 , 144 , 148 , 149 , 151 , 153 , 154 , 156 , 157 , 158 , 163 , 168 , 171 , 174 , 176 , 177 , 180 , 183 , 184 , 186 , 192 , 193 , 195 , 196 , 197 , 200 , 202 , 203 , 215 , 216 , 217 , 219 , 221 , 223 , 228 , 229 , 235 , 237 , 238 , 240 , 241 , 246 , 248 , 250 , 255 , 258 , 260 , 263 , 266 , 269 , 272 , 275 , 276 , 278 , 280 , 286 , 287 , 290 , 294 , 295 , 296 , 297 , 301 , 302 , 303 , 305 , 313 , 317 , 321 , 323 , 327 , 330 , 333 , 334 , 338 , 339 , 340 , 341 , 343 , 344 , 345 , 347 , 354 , 358 , 359 , 363 , 366 , 368 , 369 , 371 , 374 , 375 , 377 , 378 , 380 , 383 , 384 , 386 , 391 , 393 , 394 , 395 , 398 , 399 , 400 , 403 , 404 , 405 , 408 , 409 , 412 , 413 , 417 , 418 , 422 , 427 , 433 , 436 , 438 , 440 , 442 , 445 , 447 , 448 , 451 , 453 , 454 , 455 , 456 , 457 , 459 , 461 , 462 , 463 topic1 0 , 1 , 2 , 3 , 4 , 5 , 6 , 7 , 8 , 9 , 10 , 11 , 12 , 13 , 14 , 15 , 16 , 17 , 18 , 19 , 20 , 21 , 22 , 23 , 24 , 25 , 26 , 27 , 28 , 29 , 30 , 31 , 32 , 33 , 34 , 35 , 36 , 37 , 38 , 39 , 40 , 41 , 42 , 43 , 44 , 45 , 46 , 47 , 48 , 49 , 50 , 51 , 52 , 53 , 54 , 55 , 56 , 57 , 58 , 59 , 60 , 61 , 62 , 63 , 64 , 65 , 66 , 67 , 68 , 69 , 70 , 71 , 72 , 73 , 74 , 75 , 76 , 77 , 78 , 79 , 80 , 81 , 82 , 83 , 84 , 85 , 86 , 87 , 88 , 89 , 90 , 91 , 92 , 93 , 94 , 95 , 96 , 97 , 98 , 99 , 100 , 101 , 
102 , 103 , 104 , 105 , 106 , 107 , 108 , 109 , 110 , 111 , 112 , 113 , 114 , 115 , 116 , 117 , 118 , 119 , 120 , 121 , 122 , 123 , 124 , 125 , 126 , 127 , 128 , 129 , 130 , 131 , 132 , 133 , 134 , 135 , 136 , 137 , 138 , 139 , 140 , 141 , 142 , 143 , 144 , 145 , 146 , 147 , 148 , 149 , 150 , 151 , 152 , 153 , 154 , 155 , 156 , 157 , 158 , 159 , 160 , 161 , 162 , 163 , 164 , 165 , 166 , 167 , 168 , 169 , 170 , 171 , 172 , 173 , 174 , 175 , 176 , 177 , 178 , 179 , 180 , 181 , 182 , 183 , 184 , 185 , 186 , 187 , 188 , 189 , 190 , 191 , 192 , 193 , 194 , 195 , 196 , 197 , 198 , 199 topic2 0 , 1 , 2 , 3 , 4 , 5 , 6 , 7 , 8 , 9 , 10 , 11 , 12 , 13 , 14 , 15 , 16 , 17 , 18 , 19 , 20 , 21 , 22 , 23 , 24 , 25 , 26 , 27 , 28 , 29 , 30 , 31 , 32 , 33 , 34 , 35 , 36 , 37 , 38 , 39 , 40 , 41 , 42 , 43 , 44 , 45 , 46 , 47 , 48 , 49 , 50 , 51 , 52 , 53 , 54 , 55 , 56 , 57 , 58 , 59 , 60 , 61 , 62 , 63 , 64 , 65 , 66 , 67 , 68 , 69 , 70 , 71 , 72 , 73 , 74 , 75 , 76 , 77 , 78 , 79 , 80 , 81 , 82 , 83 , 84 , 85 , 86 , 87 , 88 , 89 , 90 , 91 , 92 , 93 , 94 , 95 , 96 , 97 , 98 , 99 , 100 , 101 , 102 , 103 , 104 , 105 , 106 , 107 , 108 , 109 , 110 , 111 , 112 , 113 , 114 , 115 , 116 , 117 , 118 , 119 , 120 , 121 , 122 , 123 , 124 , 125 , 126 , 127 , 128 , 129 , 130 , 131 , 132 , 133 , 134 , 135 , 136 , 137 , 138 , 139 , 140 , 141 , 142 , 143 , 144 , 145 , 146 , 147 , 148 , 149 , 150 , 151 , 152 , 153 , 154 , 155 , 156 , 157 , 158 , 159 , 160 , 161 , 162 , 163 , 164 , 165 , 166 , 167 , 168 , 169 , 170 , 171 , 172 , 173 , 174 , 175 , 176 , 177 , 178 , 179 , 180 , 181 , 182 , 183 , 184 , 185 , 186 , 187 , 188 , 189 , 190 , 191 , 192 , 193 , 194 , 195 , 196 , 197 , 198 , 199 ct 0 , 1 , 2 , 3 , 4 os 0 , 1 , 2 carrier 0 , 1 , 2 , 3 house 1 interest3 1 , 2 , 3 , 4 , 5 , 6 , 7 , 8 , 9 , 10 kw3 1 , 7 , 8 , 10 , 15 , 19 , 25 , 27 , 29 , 36 , 50 , 56 , 63 , 68 , 69 , 74 , 77 , 80 , 88 , 93 , 95 , 101 , 117 , 122 , 123 , 124 , 126 , 132 , 133 , 136 , 138 , 149 , 151 , 152 , 153 , 155 , 157 , 164 
, 167 , 171 , 173 , 174 , 181 , 186 , 188 , 190 , 192 , 194 , 197 , 198 , 206 , 209 , 213 , 223 , 228 , 233 , 235 , 248 , 249 , 253 , 263 , 273 , 276 , 278 , 280 , 286 , 288 , 295 , 302 , 303 , 311 , 314 , 316 , 323 , 328 , 331 , 332 , 333 , 336 , 343 , 349 , 362 , 364 , 366 , 370 , 372 , 381 , 385 , 391 , 394 , 399 , 401 , 404 , 411 , 412 , 416 , 420 , 425 , 427 , 431 , 453 , 459 , 464 , 465 , 469 , 470 , 474 , 488 , 499 , 504 , 505 , 508 , 512 , 513 , 523 , 530 , 531 , 534 , 539 , 549 , 559 , 560 , 563 , 566 , 568 , 570 , 574 , 581 , 586 , 588 , 598 , 607 , 610 , 617 , 627 , 630 , 633 , 634 , 635 , 636 , 638 , 645 , 650 , 654 , 655 , 657 , 663 , 668 , 676 , 677 , 681 , 685 , 686 , 687 , 691 , 692 , 694 , 695 , 696 , 699 , 701 , 703 , 705 , 707 , 709 , 719 , 722 , 723 , 725 , 734 , 735 , 737 , 739 , 740 , 742 , 745 , 751 , 755 , 763 , 764 , 769 , 771 , 780 , 785 , 788 , 799 , 800 , 805 , 809 , 818 , 821 , 833 , 835 , 836 , 840 , 851 , 853 , 856 , 860 , 862 topic3 1 , 3 , 4 , 8 , 10 , 11 , 14 , 16 , 18 , 19 , 21 , 22 , 23 , 24 , 25 , 27 , 28 , 30 , 31 , 32 , 33 , 34 , 35 , 37 , 39 , 42 , 43 , 44 , 46 , 47 , 49 , 51 , 53 , 54 , 55 , 56 , 58 , 59 , 60 , 62 , 63 , 65 , 66 , 68 , 69 , 70 , 72 , 75 , 76 , 78 , 79 , 81 , 84 , 87 , 88 , 90 , 92 , 95 , 98 , 99 , 100 , 101 , 102 , 103 , 107 , 108 , 109 , 111 , 112 , 113 , 115 , 116 , 117 , 119 , 120 , 121 , 123 , 124 , 126 , 127 , 129 , 130 , 132 , 133 , 136 , 137 , 138 , 139 , 141 , 142 , 143 , 146 , 148 , 150 , 151 , 154 , 157 , 158 , 159 , 161 , 162 , 164 , 165 , 166 , 167 , 168 , 169 , 171 , 174 , 176 , 177 , 178 , 180 , 182 , 183 , 185 , 186 , 187 , 188 , 190 , 191 , 192 , 193 , 194 , 197 , 198 , 199 , 201 , 202 , 205 , 206 , 207 , 209 , 210 , 211 , 212 , 213 , 214 , 215 , 216 , 217 , 218 , 219 , 220 , 221 , 223 , 226 , 227 , 228 , 232 , 233 , 234 , 235 , 237 , 238 , 240 , 241 , 243 , 251 , 252 , 253 , 255 , 256 , 258 , 259 , 260 , 262 , 264 , 265 , 266 , 267 , 268 , 269 , 270 , 271 , 272 , 273 , 274 , 275 , 278 , 279 
, 280 , 282 , 283 , 285 , 287 , 288 , 292 , 297 , 298 , 299 , 301 , 304 , 305 , 306 , 307 , 308 , 309 , 312 , 314 interest4 1 , 2 , 3 , 4 , 5 , 6 , 7 , 8 , 9 , 10 appIdAction 1 , 2 , 3 , 4 , 5 , 6 , 7 , 8 , 9 , 10 , 11 , 12 , 13 , 14 , 15 , 16 , 17 , 18 , 19 , 20 , 21 , 22 , 23 , 24 , 25 , 26 , 27 , 28 , 29 , 30 , 31 , 32 , 33 , 34 , 35 , 36 , 37 , 38 , 39 , 40 , 41 , 42 , 43 , 44 , 45 , 46 , 47 , 48 , 49 , 50 , 51 , 52 , 53 , 54 , 55 , 56 , 57 , 58 , 59 , 60 , 61 , 62 , 63 , 64 , 65 , 66 , 67 , 68 , 69 , 70 , 71 , 72 , 73 , 74 , 75 , 76 , 77 , 78 , 79 , 80 , 81 , 82 , 83 , 84 , 85 , 86 , 87 , 88 , 89 , 90 , 91 , 92 , 93 , 94 , 95 , 96 , 97 , 98 , 99 , 100 , 101 , 102 , 103 , 104 , 105 , 106 , 107 , 108 , 109 , 110 , 111 , 112 , 113 , 114 , 115 , 116 , 117 , 118 , 119 , 120 , 121 , 122 , 123 , 124 , 125 , 126 , 127 , 128 , 129 , 130 , 131 , 132 , 133 , 134 , 135 , 136 , 137 , 138 , 139 , 140 , 141 , 142 , 143 , 144 , 145 , 146 , 147 , 148 , 149 , 150 , 151 , 152 , 153 , 154 , 155 , 156 , 157 , 158 , 159 , 160 , 161 , 162 , 163 , 164 , 165 , 166 , 167 , 168 , 169 , 170 , 171 , 172 , 173 , 174 , 175 , 176 , 177 , 178 , 179 , 180 , 181 , 182 , 183 , 184 , 185 , 186 , 187 , 188 , 189 , 190 , 191 , 192 , 193 , 194 , 195 , 196 , 197 , 198 , 199 , 200 appIdInstall 1 , 2 , 3 , 4 , 5 , 6 , 7 , 8 , 9 , 10 , 11 , 12 , 13 , 14 , 15 , 16 , 17 , 18 , 19 , 20 , 21 , 22 , 23 , 24 , 25 , 26 , 27 , 28 , 29 , 30 , 31 , 32 , 33 , 34 , 35 , 36 , 37 , 38 , 39 , 40 , 41 , 42 , 43 , 44 , 45 , 46 , 47 , 48 , 49 , 50 , 51 , 52 , 53 , 54 , 55 , 56 , 57 , 58 , 59 , 60 , 61 , 62 , 63 , 64 , 65 , 66 , 67 , 68 , 69 , 70 , 71 , 72 , 73 , 74 , 75 , 76 , 77 , 78 , 79 , 80 , 81 , 82 , 83 , 84 , 85 , 86 , 87 , 88 , 89 , 90 , 91 , 92 , 93 , 94 , 95 , 96 , 97 , 98 , 99 , 100 , 101 , 102 , 103 , 104 , 105 , 106 , 107 , 108 , 109 , 110 , 111 , 112 , 113 , 114 , 115 , 116 , 117 , 118 , 119 , 120 , 121 , 122 , 123 , 124 , 125 , 126 , 127 , 128 , 129 , 130 , 131 , 132 , 133 , 134 , 135 , 136 , 137 , 138 , 
139 , 140 , 141 , 142 , 143 , 144 , 145 , 146 , 147 , 148 , 149 , 150 , 151 , 152 , 153 , 154 , 155 , 156 , 157 , 158 , 159 , 160 , 161 , 162 , 163 , 164 , 165 , 166 , 167 , 168 , 169 , 170 , 171 , 172 , 173 , 174 , 175 , 176 , 177 , 178 , 179 , 180 , 181 , 182 , 183 , 184 , 185 , 186 , 187 , 188 , 189 , 190 , 191 , 192 , 193 , 194 , 195 , 196 , 197 , 198 , 199 , 200 |
res_f_cut = 'toknowitCUT.txt'
# with open(res_f_cut, 'w') as fw:
#     s = '\n'.join([','.join([str(i) for i in l]) for l in cut_l])
#     fw.write(s)
# Read the cut file pairwise: odd lines carry a feature name, even lines its
# comma-separated values. Print one "<feature>_<value>" column per value for
# small value sets; print just the feature name for value sets larger than
# `sparse_num_drop_max`.
sparse_num_drop_max, is_odd_line = 50, True
with open(res_f_cut, 'r') as fr:
    for line in fr:
        # FIX: strip the trailing newline and split on ',' — the pasted source
        # had the replace/split quote characters garbled ("' ').split(' ,'"),
        # which would never match the file format written above (see the
        # corrected copy of this loop later in this file).
        parts = line.replace('\n', '').split(',')
        if is_odd_line == True:
            is_odd_line = False
            k = parts[0]
        else:
            is_odd_line = True
            if len(parts) <= sparse_num_drop_max:
                for ii in parts:
                    s = '{}_{}'.format(k, ii)
                    print(s)
            else:
                print(k)
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 | age_0 age_1 age_2 age_3 age_4 age_5 gender_0 gender_1 gender_2 marriageStatus_0 marriageStatus_2 marriageStatus_3 marriageStatus_5 marriageStatus_6 marriageStatus_8 marriageStatus_9 marriageStatus_10 marriageStatus_11 marriageStatus_12 marriageStatus_13 marriageStatus_14 marriageStatus_15 education_0 education_1 education_2 education_3 education_4 education_5 education_6 education_7 consumptionAbility_0 consumptionAbility_1 consumptionAbility_2 LBS interest1 interest2 interest5 kw1 kw2 topic1 topic2 ct_0 ct_1 ct_2 ct_3 ct_4 os_0 os_1 os_2 carrier_0 carrier_1 carrier_2 carrier_3 house_1 interest3_1 interest3_2 interest3_3 interest3_4 interest3_5 interest3_6 interest3_7 interest3_8 interest3_9 interest3_10 kw3 topic3 interest4_1 interest4_2 interest4_3 interest4_4 interest4_5 interest4_6 interest4_7 interest4_8 interest4_9 interest4_10 appIdAction appIdInstall |
1 2 3 | [ 'uid' , 'age' , 'gender' , 'marriageStatus' , 'education' , 'consumptionAbility' , 'LBS' , 'interest1' , 'interest2' , 'interest3' , 'interest4' , 'interest5' , 'kw1' , 'kw2' , 'kw3' , 'topic1' , 'topic2' , 'topic3' , 'appIdInstall' , 'appIdAction' , 'ct' , 'os' , 'carrier' , 'house' ] [ 'age_0' , 'age_1' , 'age_2' , 'age_3' , 'age_4' , 'age_5' , 'gender_0' , 'gender_1' , 'gender_2' , 'marriageStatus_0' , 'marriageStatus_2' , 'marriageStatus_3' , 'marriageStatus_5' , 'marriageStatus_6' , 'marriageStatus_8' , 'marriageStatus_9' , 'marriageStatus_10' , 'marriageStatus_11' , 'marriageStatus_12' , 'marriageStatus_13' , 'marriageStatus_14' , 'marriageStatus_15' , 'education_0' , 'education_1' , 'education_2' , 'education_3' , 'education_4' , 'education_5' , 'education_6' , 'education_7' , 'consumptionAbility_0' , 'consumptionAbility_1' , 'consumptionAbility_2' , 'LBS_0' , 'interest1_0' , 'interest2_0' , 'interest3_1' , 'interest3_2' , 'interest3_3' , 'interest3_4' , 'interest3_5' , 'interest3_6' , 'interest3_7' , 'interest3_8' , 'interest3_9' , 'interest3_10' , 'interest4_1' , 'interest4_2' , 'interest4_3' , 'interest4_4' , 'interest4_5' , 'interest4_6' , 'interest4_7' , 'interest4_8' , 'interest4_9' , 'interest4_10' , 'interest5_0' , 'kw1_0' , 'kw2_0' , 'kw3_0' , 'topic1_0' , 'topic2_0' , 'topic3_0' , 'appIdInstall_0' , 'appIdAction_0' , 'ct_0' , 'ct_1' , 'ct_2' , 'ct_3' , 'ct_4' , 'os_0' , 'os_1' , 'os_2' , 'carrier_0' , 'carrier_1' , 'carrier_2' , 'carrier_3' , 'house_1' ] |
f = 'userFeature.data'
'''
17 marriageStatus 11
19 marriageStatus 2 13
20 marriageStatus 13 10
16 marriageStatus 0
21 marriageStatus 2 13 9
22 marriageStatus 12 13 9
23 marriageStatus 12 13 10
11 marriageStatus 11
5 13 marriageStatus 5 13
13 10 marriageStatus 13 10
10 marriageStatus 10
15 marriageStatus 15
0 marriageStatus 0
13 15 marriageStatus 13 15
12 13 marriageStatus 12 13
13 marriageStatus 13
6 13 marriageStatus 6 13
2 13 marriageStatus 2 13
13 9 marriageStatus 13 9
6 13 9 marriageStatus 6 13 9
2 13 9 marriageStatus 2 13 9
5 13 9 marriageStatus 5 13 9
12 13 9 marriageStatus 12 13 9
14 marriageStatus 14
12 13 10 marriageStatus 12 13 10
3 marriageStatus 3
15 10 marriageStatus 15 10
8 marriageStatus 8
6 13 10 marriageStatus 6 13 10
5 13 10 marriageStatus 5 13 10
13 10 9 marriageStatus 13 10 9
13 15 10 marriageStatus 13 15 10
2 13 10 marriageStatus 2 13 10

marriageStatus 0 2 5 6 8 9 10 11 12 13 15
'''


def fw_s(f, s):
    """Overwrite file path `f` with string `s`."""
    with open(f, 'w') as fw:
        fw.write(s)


# d collects the values seen under each feature; a feature with more than one
# value is a candidate for being split into per-value sub-features.
# k_oreder_l records the order of first-level feature keys per input row.
res_f = 'toknowit.txt'
res_f_k_order = res_f.replace('.', 'KeyOrder.')
res_f_cut, children_val_max = res_f.replace('.', 'Cut.'), 50

to_write = True
to_write = False  # toggle: skip the expensive full-file scan (artifacts already on disk)
if to_write:
    d, reduce_chk_counter, reduce_chk_step, k_oreder_l = {}, 0, 500000, []
    with open(f, 'r') as fr:
        for line in fr:
            groups = line.replace('\n', '').split('|')
            row_key_order = []
            for group in groups:
                fields = group.split(' ')
                key = fields[0]
                row_key_order.append(key)
                if key == 'uid':
                    continue
                if key not in d:
                    d[key] = []
                # numeric value ids -> int
                for raw in fields[1:]:
                    d[key].append(int(raw))
            k_oreder_l.append(row_key_order)
            reduce_chk_counter += 1
            print(reduce_chk_counter)
            if reduce_chk_counter % reduce_chk_step == 0:
                # reduce_chk_counter = 0
                for key in d:
                    d[key] = list(set(d[key]))
                # rows are lists (unhashable, set() raises TypeError), so
                # dedupe by first occurrence instead
                k_oreder_l = [e for idx, e in enumerate(k_oreder_l) if k_oreder_l.index(e) == idx]
    for key in d:
        d[key] = sorted(list(set(d[key])), reverse=False)
    k_oreder_l = [e for idx, e in enumerate(k_oreder_l) if k_oreder_l.index(e) == idx]
    fw_s(res_f, '\n'.join(['{}\n{}'.format(key, ','.join([str(v) for v in d[key]])) for key in d]))
    fw_s(res_f_cut, '\n'.join(['{}\n{}'.format(key, ','.join([str(v) for v in d[key][0:children_val_max]])) for key in d]))
    fw_s(res_f_k_order, '\n'.join(['|'.join(row) for row in k_oreder_l]))

with open(res_f_k_order, 'r') as fr:
    ori_feature_l = [row.replace('\n', '').split('|') for row in fr]

# For every key, gather the keys that ever appear after it in a row; a key
# followed by more distinct keys sorts earlier in the recovered global order.
feature_after_e_d = {}
for row in ori_feature_l:
    for key in row:
        if key not in feature_after_e_d:
            feature_after_e_d[key] = []
        feature_after_e_d[key] += row[row.index(key) + 1:]
        feature_after_e_d[key] = list(set(feature_after_e_d[key]))
feature_l = [k for k in sorted(feature_after_e_d, key=lambda e: len(feature_after_e_d[e]), reverse=True)]
print(feature_l)

import re

# Merge numbered siblings (kw1/kw2/... -> kw). set() would destroy the order,
# so dedupe by first occurrence.
# NOTE(review): Match.endpos is the length of the searched string, so
# `[0:endpos - 1]` strips exactly one trailing character; it only matches the
# intent for names ending in a single digit — confirm (Match.start() is likely
# what was meant).
feature_reduce_l = [name if re.search(r'\d', name) is None else name[0:re.search(r'\d', name).endpos - 1]
                    for name in feature_l]
print(feature_reduce_l)
print(list(set(feature_reduce_l)))
feature_reduce_l = [e for idx, e in enumerate(feature_reduce_l) if feature_reduce_l.index(e) == idx]
print(feature_reduce_l)

# Attribute values per feature: odd lines carry the feature name, even lines
# its values; a value list longer than the cap collapses to the single value 0.
sparse_num_drop_max, is_odd_line = 20, True
res_d = {}
with open(res_f_cut, 'r') as fr:
    for line in fr:
        parts = line.replace('\n', '').split(',')
        if is_odd_line == True:
            is_odd_line = False
            k = parts[0]
            res_d[k] = []
        else:
            is_odd_line = True
            if len(parts) <= sparse_num_drop_max:
                for val in parts:
                    res_d[k].append(val)
            else:
                res_d[k].append(0)

# Expand into "<feature>_<value>" column names, excluding 'uid'.
feature_expand_l = []
feature_l.pop(feature_l.index('uid'))
for k in feature_l:
    feature_expand_l += ['{}_{}'.format(k, v) for v in res_d[k]]
print(feature_expand_l)
dd = 5
import re
import time

f = 'userFeature.data'
'''
17 marriageStatus 11
19 marriageStatus 2 13
20 marriageStatus 13 10
16 marriageStatus 0
21 marriageStatus 2 13 9
22 marriageStatus 12 13 9
23 marriageStatus 12 13 10
11 marriageStatus 11
5 13 marriageStatus 5 13
13 10 marriageStatus 13 10
10 marriageStatus 10
15 marriageStatus 15
0 marriageStatus 0
13 15 marriageStatus 13 15
12 13 marriageStatus 12 13
13 marriageStatus 13
6 13 marriageStatus 6 13
2 13 marriageStatus 2 13
13 9 marriageStatus 13 9
6 13 9 marriageStatus 6 13 9
2 13 9 marriageStatus 2 13 9
5 13 9 marriageStatus 5 13 9
12 13 9 marriageStatus 12 13 9
14 marriageStatus 14
12 13 10 marriageStatus 12 13 10
3 marriageStatus 3
15 10 marriageStatus 15 10
8 marriageStatus 8
6 13 10 marriageStatus 6 13 10
5 13 10 marriageStatus 5 13 10
13 10 9 marriageStatus 13 10 9
13 15 10 marriageStatus 13 15 10
2 13 10 marriageStatus 2 13 10

marriageStatus 0 2 5 6 8 9 10 11 12 13 15
'''


def fw_s(f, s):
    """Overwrite file path `f` with string `s`."""
    with open(f, 'w') as fw:
        fw.write(s)


# feature_d collects the values under each feature; a feature with more than
# one value is a candidate for being split into per-value sub-features.
# feature_incomplete_rows_l records the per-row sequence of feature keys.
# Raw-data mining artifacts: per-feature value lists, per-feature value counts,
# feature order. Parameter-free — their content is stable across runs:
f_feature = 'toknowit.txt'
f_feature_ele_num = f_feature.replace('.', 'EleNum.')
f_feature_incomplete, f_feature_complete = f_feature.replace('.', 'Incomplete.'), f_feature.replace('.', 'Complete.')
# Production artifacts: promote sub-features to first-class features, capped
# per original feature (pick the cap from compute budget and algorithm).
f_reduce, f_output = f_feature.replace('.', 'Reduce.'), f_feature.replace('.', 'Output.')
# Stage-1 result depends on parameters, so stamp the filename with a timestamp.
f_extend = f_feature.replace('.', 'Extend{}.'.format(int(time.time())))

to_write_immutable_file = True
# to_write_immutable_file = False
if to_write_immutable_file:
    feature_d, feature_incomplete_rows_l, reduce_chk_counter, reduce_chk_step = {}, [], 0, 300000
    # Scan the (multi-GB) file once, producing two collections: the distinct
    # per-row feature-key sequences and the per-feature value lists.
    with open(f, 'r') as fr:
        for i in fr:
            l = i.replace('\n', '').split('|')
            feature_incomplete_rows_l_this = []
            for ii in l:
                ll = ii.split(' ')
                k = ll[0]
                feature_incomplete_rows_l_this.append(k)
                if k == 'uid':
                    continue
                if k not in feature_d:
                    feature_d[k] = []
                # numeric value ids -> int
                for iii in ll[1:]:
                    feature_d[k].append(int(iii))
            feature_incomplete_rows_l.append(feature_incomplete_rows_l_this)
            reduce_chk_counter += 1
            print(reduce_chk_counter)
            if reduce_chk_counter % reduce_chk_step == 0:
                # reduce_chk_counter = 0  # resetting would save memory; kept to watch progress while testing
                for k in feature_d:
                    feature_d[k] = list(set(feature_d[k]))
                # rows are lists (unhashable), so dedupe by first occurrence
                feature_incomplete_rows_l = [e for idx, e in enumerate(feature_incomplete_rows_l)
                                             if feature_incomplete_rows_l.index(e) == idx]
                # subset TEST: stop after the first chunk
                break
    for k in feature_d:
        feature_d[k] = sorted(list(set(feature_d[k])), reverse=False)
    feature_incomplete_rows_l = [e for idx, e in enumerate(feature_incomplete_rows_l)
                                 if feature_incomplete_rows_l.index(e) == idx]
    s = '\n'.join([','.join(l) for l in feature_incomplete_rows_l])
    fw_s(f_feature_incomplete, s)

    # For every key, gather the keys that ever appear after it in a row; a key
    # followed by more distinct keys sorts earlier in the global order.
    feature_after_e_d = {}
    for l in feature_incomplete_rows_l:
        for e in l:
            if e not in feature_after_e_d:
                feature_after_e_d[e] = []
            feature_after_e_d[e] += l[l.index(e) + 1:]
            feature_after_e_d[e] = list(set(feature_after_e_d[e]))
    # complete, ordered list of original first-level features
    feature_complete_l = [k for k in sorted(feature_after_e_d, key=lambda e: len(feature_after_e_d[e]), reverse=True)]
    print(feature_complete_l)
    s = '\n'.join(feature_complete_l)
    fw_s(f_feature_complete, s)
    print(feature_complete_l)

    # Re-key feature_d in the recovered order, dropping 'uid'.
    feature_d_ = {}
    for feature in feature_complete_l:
        if feature == 'uid':
            continue
        feature_d_[feature] = feature_d[feature]
    del feature_d
    feature_d = feature_d_

    s = '\n'.join(['{}\n{}'.format(k, ','.join([str(v) for v in feature_d[k]])) for k in feature_d])
    fw_s(f_feature, s)
    s = '\n'.join(['{}\n{}'.format(k, len(feature_d[k])) for k in feature_d])
    fw_s(f_feature_ele_num, s)
    # raw data persisted <---

    # ---> derive production data from the raw artifacts
    # "level-0" features: merge numbered siblings such as kw1, kw2, ...
    # BUG FIX: Match.endpos is the length of the searched string, not the match
    # position — `[0:endpos - 1]` only worked by accident for names ending in a
    # single digit. Match.start() strips the numeric suffix as intended.
    # (Also: raw string r'\d' avoids the invalid-escape deprecation warning.)
    feature_reduce_l = [name if re.search(r'\d', name) is None else name[0:re.search(r'\d', name).start()]
                        for name in feature_complete_l]
    # set() would destroy the order, so dedupe by first occurrence
    print(feature_reduce_l)
    print(list(set(feature_reduce_l)))
    feature_reduce_l = [e for idx, e in enumerate(feature_reduce_l) if feature_reduce_l.index(e) == idx]
    print(feature_reduce_l)
    s = '\n'.join(feature_reduce_l)
    fw_s(f_reduce, s)

    # Cap per-feature value count: at most `absolute_`, but no more than the
    # largest count within the smallest `relative_` share of features.
    relative_, absolute_ = 2 / 3, 50
    sparse_num_drop_max = min([absolute_,
                               max(sorted([len(feature_d[k]) for k in feature_d],
                                          reverse=False)[0:int(len(feature_d) * relative_)])])
    s = '\n'.join(['{}\n{}'.format(k, ','.join([str(v) for v in feature_d[k][0:sparse_num_drop_max]]))
                   for k in feature_d])
    fw_s(f_output, s)

    # Expand features into "<feature>_<value>" columns. Odd lines hold the
    # feature name, even lines its values; an over-long value list collapses
    # to the single value 0.
    feature_extend_d = {}
    is_odd_line = True
    with open(f_output, 'r') as fr:
        for i in fr:
            l = i.replace('\n', '').split(',')
            if is_odd_line == True:
                is_odd_line = False
                k = l[0]
                feature_extend_d[k] = []
            else:
                is_odd_line = True
                if len(l) <= sparse_num_drop_max:
                    for ii in l:
                        feature_extend_d[k].append(ii)
                else:
                    feature_extend_d[k].append(0)

    # NOTE(review): this pop looks vestigial — the expansion below iterates
    # feature_extend_d, which never contained 'uid'. (Removed the dead
    # `feature_extend_l = []` that the expression below overwrote anyway.)
    feature_complete_l.pop(feature_complete_l.index('uid'))
    feature_extend_l = '|'.join(['|'.join(['{}_{}'.format(k, str(v)) for v in feature_extend_d[k]])
                                 for k in feature_extend_d]).split('|')
    print(feature_extend_l)
    s = ','.join(feature_extend_l)
    fw_s(f_extend, s)

# Build a shrunken data set to test a Spark join broadcasting the ad features.
# feature_extend_l, f_extend = [], 'toknowitExtend1526836898.txt'
# with open(f_extend, 'r') as fr:
#     feature_extend_l = [i.replace('\n', '') for i in fr][0].split(',')
d = 8
1 | age_0,age_1,age_2,age_3,age_4,age_5,gender_0,gender_1,gender_2,marriageStatus_0,marriageStatus_2,marriageStatus_3,marriageStatus_5,marriageStatus_6,marriageStatus_9,marriageStatus_10,marriageStatus_11,marriageStatus_12,marriageStatus_13,marriageStatus_14,marriageStatus_15,education_0,education_1,education_2,education_3,education_4,education_5,education_6,education_7,consumptionAbility_0,consumptionAbility_1,consumptionAbility_2,LBS_0,LBS_1,LBS_4,LBS_6,LBS_7,LBS_8,LBS_9,LBS_13,LBS_14,LBS_15,LBS_16,LBS_18,LBS_19,LBS_21,LBS_25,LBS_27,LBS_29,LBS_32,LBS_33,LBS_35,LBS_38,LBS_39,LBS_41,LBS_43,LBS_45,LBS_46,LBS_47,LBS_48,LBS_49,LBS_52,LBS_54,LBS_56,LBS_57,LBS_61,LBS_62,LBS_63,LBS_64,LBS_66,LBS_69,LBS_71,LBS_72,LBS_73,LBS_75,LBS_77,LBS_78,LBS_81,LBS_83,LBS_84,LBS_85,LBS_86,interest1_1,interest1_2,interest1_3,interest1_4,interest1_5,interest1_6,interest1_7,interest1_8,interest1_9,interest1_10,interest1_11,interest1_12,interest1_13,interest1_14,interest1_15,interest1_16,interest1_17,interest1_18,interest1_19,interest1_20,interest1_21,interest1_22,interest1_23,interest1_24,interest1_25,interest1_26,interest1_27,interest1_28,interest1_29,interest1_30,interest1_31,interest1_32,interest1_33,interest1_34,interest1_35,interest1_36,interest1_37,interest1_38,interest1_39,interest1_40,interest1_41,interest1_42,interest1_43,interest1_44,interest1_45,interest1_46,interest1_47,interest1_48,interest1_49,interest1_50,interest2_1,interest2_2,interest2_3,interest2_4,interest2_5,interest2_6,interest2_7,interest2_8,interest2_9,interest2_10,interest2_11,interest2_12,interest2_13,interest2_14,interest2_15,interest2_16,interest2_17,interest2_18,interest2_19,interest2_20,interest2_21,interest2_22,interest2_23,interest2_24,interest2_25,interest2_26,interest2_27,interest2_28,interest2_29,interest2_30,interest2_31,interest2_32,interest2_33,interest2_35,interest2_36,interest2_37,interest2_38,interest2_39,interest2_40,interest2_41,interest2_42,interest2_43,interest2_44,interest2_45,interest2_46,inte
rest2_47,interest2_48,interest2_49,interest2_50,interest2_51,interest3_1,interest3_2,interest3_3,interest3_4,interest3_5,interest3_6,interest3_7,interest3_8,interest3_9,interest3_10,interest4_1,interest4_2,interest4_3,interest4_4,interest4_5,interest4_6,interest4_7,interest4_8,interest4_9,interest4_10,interest5_1,interest5_2,interest5_3,interest5_4,interest5_5,interest5_6,interest5_7,interest5_8,interest5_9,interest5_10,interest5_11,interest5_12,interest5_13,interest5_14,interest5_15,interest5_16,interest5_17,interest5_18,interest5_19,interest5_20,interest5_21,interest5_22,interest5_23,interest5_24,interest5_25,interest5_26,interest5_27,interest5_28,interest5_29,interest5_30,interest5_31,interest5_32,interest5_33,interest5_34,interest5_35,interest5_36,interest5_37,interest5_38,interest5_39,interest5_40,interest5_41,interest5_42,interest5_43,interest5_44,interest5_45,interest5_46,interest5_47,interest5_48,interest5_49,interest5_50,kw1_13,kw1_19,kw1_28,kw1_69,kw1_70,kw1_72,kw1_87,kw1_92,kw1_105,kw1_106,kw1_109,kw1_119,kw1_121,kw1_123,kw1_133,kw1_136,kw1_145,kw1_152,kw1_157,kw1_163,kw1_169,kw1_176,kw1_177,kw1_180,kw1_181,kw1_191,kw1_209,kw1_235,kw1_242,kw1_249,kw1_278,kw1_279,kw1_289,kw1_295,kw1_313,kw1_317,kw1_321,kw1_336,kw1_341,kw1_344,kw1_354,kw1_358,kw1_366,kw1_367,kw1_370,kw1_376,kw1_378,kw1_380,kw1_382,kw1_391,kw2_2,kw2_10,kw2_11,kw2_34,kw2_46,kw2_47,kw2_50,kw2_55,kw2_62,kw2_63,kw2_69,kw2_70,kw2_76,kw2_87,kw2_91,kw2_100,kw2_114,kw2_116,kw2_117,kw2_123,kw2_124,kw2_127,kw2_129,kw2_135,kw2_137,kw2_142,kw2_144,kw2_151,kw2_158,kw2_163,kw2_168,kw2_174,kw2_177,kw2_180,kw2_184,kw2_192,kw2_196,kw2_197,kw2_200,kw2_202,kw2_215,kw2_216,kw2_217,kw2_223,kw2_235,kw2_237,kw2_240,kw2_241,kw2_246,kw2_250,kw3_7,kw3_27,kw3_29,kw3_68,kw3_80,kw3_88,kw3_95,kw3_101,kw3_138,kw3_171,kw3_186,kw3_197,kw3_198,kw3_206,kw3_213,kw3_223,kw3_248,kw3_263,kw3_273,kw3_302,kw3_316,kw3_336,kw3_349,kw3_362,kw3_381,kw3_401,kw3_412,kw3_416,kw3_453,kw3_465,kw3_470,kw3_488,kw3_513,kw3_534,kw3_549,kw3_560,
kw3_570,kw3_581,kw3_586,kw3_598,kw3_610,kw3_627,kw3_633,kw3_638,kw3_668,kw3_685,kw3_692,kw3_694,kw3_695,kw3_701,topic1_0,topic1_1,topic1_2,topic1_3,topic1_4,topic1_5,topic1_6,topic1_7,topic1_9,topic1_10,topic1_11,topic1_12,topic1_13,topic1_14,topic1_15,topic1_16,topic1_17,topic1_18,topic1_19,topic1_20,topic1_21,topic1_22,topic1_23,topic1_24,topic1_25,topic1_26,topic1_27,topic1_28,topic1_29,topic1_30,topic1_31,topic1_32,topic1_33,topic1_34,topic1_35,topic1_36,topic1_37,topic1_38,topic1_39,topic1_40,topic1_41,topic1_42,topic1_43,topic1_44,topic1_45,topic1_46,topic1_47,topic1_48,topic1_49,topic1_50,topic2_0,topic2_2,topic2_3,topic2_4,topic2_5,topic2_6,topic2_7,topic2_9,topic2_10,topic2_11,topic2_13,topic2_14,topic2_15,topic2_16,topic2_17,topic2_19,topic2_20,topic2_21,topic2_22,topic2_24,topic2_25,topic2_26,topic2_27,topic2_28,topic2_29,topic2_30,topic2_31,topic2_32,topic2_33,topic2_34,topic2_35,topic2_36,topic2_39,topic2_40,topic2_41,topic2_42,topic2_43,topic2_44,topic2_45,topic2_46,topic2_47,topic2_48,topic2_49,topic2_50,topic2_51,topic2_52,topic2_53,topic2_54,topic2_55,topic2_56,topic3_3,topic3_10,topic3_11,topic3_14,topic3_18,topic3_24,topic3_28,topic3_30,topic3_31,topic3_33,topic3_39,topic3_42,topic3_43,topic3_47,topic3_53,topic3_55,topic3_56,topic3_58,topic3_59,topic3_60,topic3_62,topic3_66,topic3_68,topic3_70,topic3_72,topic3_76,topic3_78,topic3_79,topic3_81,topic3_84,topic3_87,topic3_90,topic3_92,topic3_99,topic3_100,topic3_101,topic3_109,topic3_111,topic3_112,topic3_119,topic3_121,topic3_123,topic3_124,topic3_127,topic3_130,topic3_136,topic3_137,topic3_138,topic3_139,topic3_141,appIdInstall_1,appIdInstall_4,appIdInstall_6,appIdInstall_9,appIdInstall_10,appIdInstall_11,appIdInstall_12,appIdInstall_15,appIdInstall_16,appIdInstall_17,appIdInstall_19,appIdInstall_21,appIdInstall_23,appIdInstall_26,appIdInstall_27,appIdInstall_28,appIdInstall_29,appIdInstall_32,appIdInstall_34,appIdInstall_35,appIdInstall_39,appIdInstall_40,appIdInstall_41,appIdInstall_42,appIdInsta
ll_43,appIdInstall_44,appIdInstall_45,appIdInstall_47,appIdInstall_48,appIdInstall_49,appIdInstall_51,appIdInstall_52,appIdInstall_55,appIdInstall_56,appIdInstall_57,appIdInstall_58,appIdInstall_60,appIdInstall_61,appIdInstall_62,appIdInstall_63,appIdInstall_65,appIdInstall_67,appIdInstall_68,appIdInstall_69,appIdInstall_70,appIdInstall_71,appIdInstall_73,appIdInstall_74,appIdInstall_76,appIdInstall_77,appIdAction_2,appIdAction_4,appIdAction_5,appIdAction_7,appIdAction_8,appIdAction_11,appIdAction_13,appIdAction_14,appIdAction_16,appIdAction_17,appIdAction_27,appIdAction_30,appIdAction_32,appIdAction_33,appIdAction_34,appIdAction_35,appIdAction_36,appIdAction_37,appIdAction_38,appIdAction_39,appIdAction_40,appIdAction_41,appIdAction_43,appIdAction_44,appIdAction_45,appIdAction_47,appIdAction_50,appIdAction_51,appIdAction_52,appIdAction_53,appIdAction_55,appIdAction_56,appIdAction_60,appIdAction_62,appIdAction_65,appIdAction_66,appIdAction_69,appIdAction_70,appIdAction_71,appIdAction_72,appIdAction_74,appIdAction_75,appIdAction_76,appIdAction_77,appIdAction_80,appIdAction_81,appIdAction_83,appIdAction_84,appIdAction_85,appIdAction_91,ct_0,ct_1,ct_2,ct_3,ct_4,os_0,os_1,os_2,carrier_0,carrier_1,carrier_2,carrier_3,house_1 |
JOIN 操作转移至Spark
# Mine the feature columns/values of "userFeature.data" (vowpal-wabbit style
# "uid|feature_group v1 v2|..." rows) and generate one-hot style train/test
# CSVs for a DNN.
# NOTE(review): recovered from a flattened paste — only formatting and
# comments were reconstructed here; the statements themselves are unchanged.
import re, time

f = 'userFeature.data'

# Scratch listing of raw marriageStatus variants observed in the data
# (kept verbatim from the original author's notes: "<len> <group text>").
'''
17 marriageStatus 11
19 marriageStatus 2 13
20 marriageStatus 13 10
16 marriageStatus 0
21 marriageStatus 2 13 9
22 marriageStatus 12 13 9
23 marriageStatus 12 13 10
11 marriageStatus 11
5 13 marriageStatus 5 13
13 10 marriageStatus 13 10
10 marriageStatus 10
15 marriageStatus 15
0 marriageStatus 0
13 15 marriageStatus 13 15
12 13 marriageStatus 12 13
13 marriageStatus 13
6 13 marriageStatus 6 13
2 13 marriageStatus 2 13
13 9 marriageStatus 13 9
6 13 9 marriageStatus 6 13 9
2 13 9 marriageStatus 2 13 9
5 13 9 marriageStatus 5 13 9
12 13 9 marriageStatus 12 13 9
14 marriageStatus 14
12 13 10 marriageStatus 12 13 10
3 marriageStatus 3
15 10 marriageStatus 15 10
8 marriageStatus 8
6 13 10 marriageStatus 6 13 10
5 13 10 marriageStatus 5 13 10
13 10 9 marriageStatus 13 10 9
13 15 10 marriageStatus 13 15 10
2 13 10 marriageStatus 2 13 10
marriageStatus 0 2 5 6 8 9 10 11 12 13 15
'''


def fw_s(f, s):
    """Overwrite file *f* with string *s* (small write helper used throughout)."""
    with open(f, 'w') as fw:
        fw.write(s)


# d: values observed under each feature; a feature with more than one value is
#    a candidate for being split into per-value sub-features.
# feature_order_l: order of the first-level features.
# Raw-data feature-mining outputs: per-feature values, per-feature value
# counts, and the feature order.
f_feature = 'toknowit.txt'
f_feature_ele_num = f_feature.replace('.', 'EleNum.')
f_feature_incomplete, f_feature_complete = f_feature.replace('.', 'Incomplete.'), f_feature.replace('.', 'Complete.')
# Raw data processed into production data: promote the sub-features of a
# first-level feature to first-level features, capping how many sub-features
# may be promoted (cap chosen from compute budget / algorithm).
f_reduce, f_output = f_feature.replace('.', 'Reduce.'), f_feature.replace('.', 'Output.')
# <--- the files generated above take no parameters; their data is constant.
# The file below is the stage-1 result and depends on parameters, hence the
# timestamp in its name.
f_extend = f_feature.replace('.', 'Extend{}.'.format(int(time.time())))

to_write_immutable_file = True
# to_write_immutable_file = False
if to_write_immutable_file:
    feature_d, feature_incomplete_rows_l, reduce_chk_counter, reduce_chk_step, = {}, [], 0, 300000
    # Stream the multi-GB file once, building two collections:
    #   feature_incomplete_rows_l — per-row sequences of feature names
    #   feature_d                 — feature name -> list of observed int values
    with open(f, 'r') as fr:
        for i in fr:
            l = i.replace('\n', '').split('|')
            feature_incomplete_rows_l_this = []
            for ii in l:
                ll = ii.split(' ')
                k = ll[0]
                feature_incomplete_rows_l_this.append(k)
                if k == 'uid':
                    continue
                if k not in feature_d:
                    feature_d[k] = []
                # order --> int
                for iii in ll[1:]:
                    feature_d[k].append(int(iii))
            feature_incomplete_rows_l.append(feature_incomplete_rows_l_this)
            reduce_chk_counter += 1
            print(reduce_chk_counter)
            if reduce_chk_counter % reduce_chk_step == 0:
                # reduce_chk_counter = 0  # resetting to 0 would save memory; kept counting during testing to watch progress and the row total
                # Periodic in-memory dedup to bound memory growth.
                for k in feature_d:
                    feature_d[k] = list(set(feature_d[k]))
                # NOTE(review): list.index inside the comprehension makes this
                # order-preserving dedup O(n^2) over all rows seen so far.
                feature_incomplete_rows_l = [e for i, e in enumerate(feature_incomplete_rows_l) if feature_incomplete_rows_l.index(e) == i]
            # subset TEST
            # break
    # Final dedup + ascending sort of the observed values per feature.
    for k in feature_d:
        feature_d[k] = sorted(list(set(feature_d[k])), reverse=False)
    feature_incomplete_rows_l = [e for i, e in enumerate(feature_incomplete_rows_l) if feature_incomplete_rows_l.index(e) == i]
    s = '\n'.join([','.join(l) for l in feature_incomplete_rows_l])
    fw_s(f_feature_incomplete, s)
    # For each feature name, record which features were ever seen AFTER it in
    # a row; ranking by the size of that set recovers a global feature order.
    feature_after_e_d = {}
    for l in feature_incomplete_rows_l:
        for e in l:
            if e not in feature_after_e_d:
                feature_after_e_d[e] = []
            feature_after_e_d[e] += l[l.index(e) + 1:]
            feature_after_e_d[e] = list(set(feature_after_e_d[e]))
    # Original first-level features, ordered.
    feature_complete_l = [k for k in sorted(feature_after_e_d, key=lambda e: len(feature_after_e_d[e]), reverse=True)]
    print(feature_complete_l)
    s = '\n'.join(feature_complete_l)
    fw_s(f_feature_complete, s)
    print(feature_complete_l)
    # Re-key feature_d in feature_complete_l order, dropping 'uid'.
    feature_d_ = {}
    for feature in feature_complete_l:
        if feature == 'uid':
            continue
        feature_d_[feature] = feature_d[feature]
    del feature_d
    feature_d = feature_d_
    s = '\n'.join(['{}\n{}'.format(k, ','.join([str(i) for i in feature_d[k]])) for k in feature_d])
    fw_s(f_feature, s)
    s = '\n'.join(['{}\n{}'.format(k, len(feature_d[k])) for k in feature_d])
    fw_s(f_feature_ele_num, s)
    # Raw-data persistence done <---
    # ---> derive new data from the raw data.
    # Level-0 features: merge numbered first-level features ("kw1", "kw2", ...).
    # NOTE(review): Match.endpos is the length of the searched string, NOT the
    # match end, so this strips exactly the last character — it only works
    # because these names end in a single digit. Confirm before reuse.
    feature_reduce_l = [i if re.search('\d', i) is None else i[0:re.search('\d', i).endpos - 1] for i in feature_complete_l]
    # set() would destroy the order, hence the index-based dedup below.
    print(feature_reduce_l)
    print(list(set(feature_reduce_l)))
    feature_reduce_l = [e for i, e in enumerate(feature_reduce_l) if feature_reduce_l.index(e) == i]
    print(feature_reduce_l)
    s = '\n'.join(feature_reduce_l)
    fw_s(f_reduce, s)
    # Cap on promoted values per sparse feature: the smaller of an absolute
    # cap (50) and the largest value-count within the lower 2/3 of features
    # ranked by value-count.
    relative_, absolute_ = 2 / 3, 50
    sparse_num_drop_max = min([absolute_, max(sorted([len(feature_d[k]) for k in feature_d], reverse=False)[0:int(len(feature_d) * relative_)])])
    s = '\n'.join(['{}\n{}'.format(k, ','.join([str(i) for i in feature_d[k][0:sparse_num_drop_max]])) for k in feature_d])
    fw_s(f_output, s)
    # Feature attribute values: re-read f_output (feature name on odd lines,
    # its kept values on even lines).
    feature_extend_d = {}
    is_odd_line = True
    with open(f_output, 'r') as fr:
        for i in fr:
            l = i.replace('\n', '').split(',')
            if is_odd_line == True:
                is_odd_line = False
                k = l[0]
                feature_extend_d[k] = []
            else:
                is_odd_line = True
                if len(l) <= sparse_num_drop_max:
                    for ii in l:
                        feature_extend_d[k].append(ii)
                else:
                    # Too many values: collapse the feature to a single 0 slot.
                    feature_extend_d[k].append(0)
    feature_extend_l = []
    feature_complete_l.pop(feature_complete_l.index('uid'))
    # Expanded column names: "<feature>_<value>" for every kept value.
    feature_extend_l = '|'.join(['|'.join(['{}_{}'.format(k, str(i)) for i in feature_extend_d[k]]) for k in feature_extend_d]).split('|')
    print(feature_extend_l)
    s = ','.join(feature_extend_l)
    fw_s(f_extend, s)

# Build a reduced data set to test a Spark join with the ad features broadcast.
# NOTE(review): this section reads files written above (including the freshly
# timestamped f_extend), so it assumes to_write_immutable_file was True on
# this run — confirm before toggling the flag off.
ori_l, extend_l = [], []
with open('toknowitComplete.txt', 'r') as fr:
    ori_l = [i.replace('\n', '') for i in fr]
feature_extend_l, f_feature_extend = [], f_extend
with open(f_feature_extend, 'r') as fr:
    extend_l = [i.replace('\n', '').split(',') for i in fr][0]
# Map each original feature to its expanded "<feature>_<value>" columns by
# substring match.
ori_extend_d = {}
for ori in ori_l:
    for extend_ in extend_l:
        if ori in extend_:
            if ori not in ori_extend_d:
                ori_extend_d[ori] = {}
            extend_d = {extend_: 0}  # NOTE(review): unused local, never read
            ori_extend_d[ori][extend_] = 0
import copy
# "age" is a substring of "marriageStatus", so the substring match above
# wrongly attaches marriageStatus_* columns to 'age'; strip them here
# (iterate a deep copy because we delete while traversing).
ori_extend_d_ = copy.deepcopy(ori_extend_d)
for i in ori_extend_d_['age']:
    if 'marriageStatus' in i:
        del ori_extend_d['age'][i]
del ori_extend_d_
'''
1 - build the data meta-structure with all leaf values set to 0
2 - for each data row, update the leaf values
'''
c_ = 0
rows_d_l = []
with open(f, 'r') as fr:
    for i in fr:
        # c_ += 1
        # if c_ == 6:
        #     break
        ori_row_l = i.replace('\n', '').split('|')
        # Fresh all-zero copy of the meta-structure for this row.
        ori_extend_d_this = copy.deepcopy(ori_extend_d)
        uid_d = {}
        for ii in ori_row_l:
            l = ii.split(' ')
            print(l)
            feature_ori, val_l = l[0], l[1:]
            if feature_ori == 'uid':
                uid = val_l[0]
                continue
            if len(ori_extend_d[feature_ori]) == 1:
                # Single-column feature: encode presence of a positive value.
                for feature_sub in ori_extend_d_this[feature_ori]:
                    print(feature_sub)
                    ori_extend_d_this[feature_ori][feature_sub] = 1 if int(val_l[0]) > 0 else 0
            else:
                for val_ in val_l:
                    feature_sub = '{}_{}'.format(feature_ori, val_)
                    print(feature_sub)
                    if feature_sub in ori_extend_d_this[feature_ori]:  ### "redundant check" (original note) — NOTE(review): it actually skips values dropped by the sparse cap
                        ori_extend_d_this[feature_ori][feature_sub] = 1
        # ????
        # NOTE(review): if a row lacks a 'uid' group, `uid` is unbound here
        # (or stale from the previous row) — confirm every row carries uid.
        uid_d[uid] = ori_extend_d_this
        del ori_extend_d_this
        rows_d_l.append(uid_d)
        del uid_d
# Flatten the per-row dicts into "uid,v1,v2,..." CSV lines.
s_l = []
f_userdata_extend = f.replace('.data', '{}.data'.format(int(time.time())))
for d in rows_d_l:
    for uid in d:
        c_ += 1
        l = []
        d_ = d[uid]
        for feature_ in d_:
            for feature_sub in d_[feature_]:
                l.append(d_[feature_][feature_sub])
        s = '{},{}'.format(uid, ','.join([str(i) for i in l]))
        s_l.append(s)
fw_s(f_userdata_extend, '\n'.join(s_l))
print(c_)
'''
gen JOIN data FOR DNN
'''
f_user = 'userFeature.data'
f_ad = 'adFeature.csv'
f_user_extend = f_userdata_extend
f_train = 'train.csv'
f_test = 'test2.csv'
'''
gen head
'''
csv_head = 'advertiserId,campaignId,creativeId,creativeSize,adCategoryId,productId,productType'
# Hard-coded extend file from an earlier run supplies the user-feature columns.
f_toknowitExtend = 'toknowitExtend1527038949.txt'
try:
    with open(f_toknowitExtend, 'r') as fr:
        for i in fr:
            csv_head = 'label,{},{}'.format(i.replace('\n', ''), csv_head)
    print(csv_head)
except Exception as e:
    print(e)
    csv_head = ''  # no file
'''
get dict
'''
# aid -> ad feature CSV fragment.
ad_d = {}
with open(f_ad, 'r') as fr:
    for i in fr:
        if 'aid' in i:  # skip the header row
            continue
        l = i.replace('\n', '').split(',')
        aid = l[0]
        ad_d[aid] = ','.join(l[1:])
# uid -> expanded user-feature CSV fragment.
uid_d = {}
with open(f_user_extend, 'r') as fr:
    for i in fr:
        if 'aid' in i:
            continue
        l = i.replace('\n', '').split(',')
        uid = l[0]
        uid_d[uid] = ','.join(l[1:])
'''
gen train data
'''
dnn_csvTRAIN = 'dnn_csvTRAIN{}.csv'.format(int(time.time()))
with open(dnn_csvTRAIN, 'w') as fa:
    # NOTE(review): csv_head is written without a trailing newline, so the
    # first data row lands on the header line — confirm intended.
    fa.write(csv_head)
with open(f_train, 'r') as fr:
    for i in fr:
        if 'aid' in i:  # skip the header row
            continue
        try:
            l = i.replace('\n', '').replace(' ', '').split(',')
            print(l)
            aid, uid, label = l
            s = '{},{},{}\n'.format(label, uid_d[uid], ad_d[aid])
            with open(dnn_csvTRAIN, 'a') as fa:
                fa.write(s)
        except Exception as e:
            # Rows whose uid/aid is missing from the dicts are skipped.
            print(e)
'''
gen test data
'''
dnn_csvTEST = 'dnn_csvTEST{}.csv'.format(int(time.time()))
with open(dnn_csvTEST, 'w') as fa:
    fa.write(csv_head)
with open(f_test, 'r') as fr:
    for i in fr:
        if 'aid' in i:
            continue
        try:
            # NOTE(review): this `break` leaves test-data generation disabled
            # (leftover debug switch) — remove it to actually emit rows.
            break
            l = i.replace('\n', '').replace(' ', '').split(',')
            print(l)
            # aid, uid, label = l
            aid, uid = l
            label = 0
            s = '{},{},{}\n'.format(label, uid_d[uid], ad_d[aid])
            with open(dnn_csvTEST, 'a') as fa:
                fa.write(s)
        except Exception as e:
            print(e)
dd = 9
【推荐】国内首个AI IDE,深度理解中文开发场景,立即下载体验Trae
【推荐】编程新体验,更懂你的AI,立即体验豆包MarsCode编程助手
【推荐】抖音旗下AI助手豆包,你的智能百科全书,全免费不限次数
【推荐】轻量又高性能的 SSH 工具 IShell:AI 加持,快人一步
· AI与.NET技术实操系列:向量存储与相似性搜索在 .NET 中的实现
· 基于Microsoft.Extensions.AI核心库实现RAG应用
· Linux系列:如何用heaptrack跟踪.NET程序的非托管内存泄露
· 开发者必知的日志记录最佳实践
· SQL Server 2025 AI相关能力初探
· winform 绘制太阳,地球,月球 运作规律
· 震惊!C++程序真的从main开始吗?99%的程序员都答错了
· AI与.NET技术实操系列(五):向量存储与相似性搜索在 .NET 中的实现
· 【硬核科普】Trae如何「偷看」你的代码?零基础破解AI编程运行原理
· 超详细:普通电脑也行Windows部署deepseek R1训练数据并当服务器共享给他人
2017-05-18 Gamma函数
2017-05-18 归纳逻辑 贝叶斯主义