Extracting feature columns and attribute values from vowpal wabbit data to generate DNN training and test data

 

The user feature file userFeature.data has one line per user, in the format "uid|features", with uid and features separated by a vertical bar "|". The features use the vowpal wabbit (https://github.com/JohnLangford/vowpal_wabbit) format: "feature_group1|feature_group2|feature_group3|...". Each feature_group is one feature group, and feature groups are also separated by "|". If a feature group contains multiple values, they are separated by spaces, in the format "feature_group_name fea_name1 fea_name2 ...", where each fea_name is a numeric ID.
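
For concreteness, a minimal sketch of parsing one such line into a dict; the sample line is made up for illustration:

line = 'uid 268167|age 2|marriageStatus 2 13|gender 1'
feature = {}
for group in line.rstrip('\n').split('|'):
    parts = group.split(' ')
    # feature_group_name -> [fea_name1, fea_name2, ...]
    feature[parts[0]] = parts[1:]
print(feature)
# {'uid': ['268167'], 'age': ['2'], 'marriageStatus': ['2', '13'], 'gender': ['1']}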

 

 

 

Feature columns and their attribute values: extraction. The first pass below scans the file once and records, for each feature group, every value observed.

 

f = 'userFeature.data'
d = {}  # feature_group_name -> list of distinct values seen
with open(f, 'r') as fr:
    for i in fr:
        l = i.rstrip('\n').split('|')
        for ii in l:
            ll = ii.split(' ')
            k = ll[0]
            if k not in d:
                d[k] = []
            for iii in ll[1:]:
                iii_ = int(iii)
                if iii_ not in d[k]:  # O(n) list scan; this is what the batched version below avoids
                    d[k].append(iii_)
for k in d:
    print(k)
    print(sorted(d[k]))

  

 

Batch processing to reduce running time: instead of an O(n) membership check on every append, append unconditionally and deduplicate each feature's list with set() every reduce_chk_step rows, plus once more after the file is exhausted.

 

f = 'userFeature.data'
d, reduce_chk_counter, reduce_chk_step = {}, 0, 500000
with open(f, 'r') as fr:
    for i in fr:
        l = i.rstrip('\n').split('|')

        for ii in l:
            ll = ii.split(' ')
            k = ll[0]
            if k not in d:
                d[k] = []
            for iii in ll[1:]:
                d[k].append(int(iii))  # append unconditionally; dedup happens in batches

        # periodic reduce: deduplicate every reduce_chk_step rows to bound memory
        reduce_chk_counter += 1
        if reduce_chk_counter == reduce_chk_step:
            reduce_chk_counter = 0
            for k in d:
                d[k] = list(set(d[k]))
                print(k)
                print(sorted(d[k]))

# final reduce once the file is exhausted
for k in d:
    d[k] = list(set(d[k]))
    print(k)
    print(sorted(d[k]))

res_f = 'toknowit.txt'
with open(res_f, 'w') as fw:
    for k in d:
        l = sorted(d[k])
        print(k)
        print(l)
        s = '{}\n{}\n'.format(k, ','.join([str(i) for i in l]))  # str() needed: values are ints
        fw.write(s)

  

Buffer redundant data in memory, process it in batches, deduplicate, and release memory.
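
As an aside, an alternative sketch that keeps one set per feature from the start; membership is O(1), so no periodic reduce is needed (assuming, as above, that all distinct values fit in memory):

f = 'userFeature.data'
d = {}
with open(f, 'r') as fr:
    for i in fr:
        for group in i.rstrip('\n').split('|'):
            ll = group.split(' ')
            k = ll[0]
            if k == 'uid':
                continue
            d.setdefault(k, set()).update(int(v) for v in ll[1:])
for k in d:
    print(k)
    print(sorted(d[k]))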

 

f = 'userFeature.data'
# determine the values taken by the marriageStatus feature column
# (pass disabled with `break`: it has already been run, and its output
#  is recorded in the docstring below)
d, d1 = {}, {}
with open(f, 'r') as fr:
    for i in fr:
        break
        l = i.split('|')
        for ii in l:
            if 'marriageStatus' in ii:
                k = len(ii)
                d[k] = ii
                k = ii.split('marriageStatus')[-1]
                d1[k] = ii
for k in d:
    print(k, d[k])
for k in d1:
    print(k, d1[k])

'''
17 marriageStatus 11
19 marriageStatus 2 13
20 marriageStatus 13 10
16 marriageStatus 0
21 marriageStatus 2 13 9
22 marriageStatus 12 13 9
23 marriageStatus 12 13 10

 11 marriageStatus 11
 5 13 marriageStatus 5 13
 13 10 marriageStatus 13 10
 10 marriageStatus 10
 15 marriageStatus 15
 0 marriageStatus 0
 13 15 marriageStatus 13 15
 12 13 marriageStatus 12 13
 13 marriageStatus 13
 6 13 marriageStatus 6 13
 2 13 marriageStatus 2 13
 13 9 marriageStatus 13 9
 6 13 9 marriageStatus 6 13 9
 2 13 9 marriageStatus 2 13 9
 5 13 9 marriageStatus 5 13 9
 12 13 9 marriageStatus 12 13 9
 14 marriageStatus 14
 12 13 10 marriageStatus 12 13 10
 3 marriageStatus 3
 15 10 marriageStatus 15 10
 8 marriageStatus 8
 6 13 10 marriageStatus 6 13 10
 5 13 10 marriageStatus 5 13 10
 13 10 9 marriageStatus 13 10 9
 13 15 10 marriageStatus 13 15 10
 2 13 10 marriageStatus 2 13 10
 
 
 marriageStatus 0 2 5 6 8 9 10 11 12 13 15
'''

# full-scan version (disabled with `break`; superseded by the batched version below)
d = {}
with open(f, 'r') as fr:
    for i in fr:
        break
        l = i.split('|')
        for ii in l:
            ll = ii.split(' ')
            k = ll[0]
            if k not in d:
                d[k] = []
            for iii in ll[1:]:
                iii_ = int(iii)
                if iii_ not in d[k]:
                    d[k].append(iii_)

# batched version (also disabled with `break`; kept for reference)
d, reduce_chk_counter, reduce_chk_step = {}, 0, 500000
with open(f, 'r') as fr:
    for i in fr:

        break  # disabled pass

        l = i.split('|')

        for ii in l:
            ll = ii.split(' ')
            k = ll[0]
            if k == 'uid':
                continue
            if k not in d:
                d[k] = []
            for iii in ll[1:]:
                iii_ = int(iii)
                d[k].append(iii_)

        reduce_chk_counter += 1
        if reduce_chk_counter == reduce_chk_step:
            reduce_chk_counter = 0
            for k in d:
                d[k] = list(set(d[k]))
                l = sorted(d[k], reverse=False)
                print(k)
                print(l)

for k in d:
    break  # disabled; results were already persisted to res_f

    d[k] = list(set(d[k]))
    l = sorted(d[k], reverse=False)
    print(k)
    print(l)

res_f = 'toknowit.txt'
# with open(res_f, 'w') as fw:
#     for k in d:
#         l = sorted(d[k], reverse=False)
#         print(k)
#         print(l)
#         s = '{}\n{}\n'.format(k, ','.join([str(i) for i in l]))
#         fw.write(s)
cut_l = []
with open(res_f, 'r') as fr:
    for i in fr:
        l = i.replace('\n', '').split(',')[0:200]
        cut_l.append(l)

res_f_cut = 'toknowitCUT.txt'

with open(res_f_cut, 'w') as fw:
    for l in cut_l:
        s = '{}\n'.format(','.join([str(i) for i in l]))
        fw.write(s)

 

 

age
0,1,2,3,4,5
gender
0,1,2
marriageStatus
0,2,3,5,6,8,9,10,11,12,13,14,15
education
0,1,2,3,4,5,6,7
consumptionAbility
0,1,2
LBS
0,1,2,3,4,6,7,8,9,10,11,12,13,14,15,16,18,19,20,21,23,25,26,27,29,30,31,32,33,35,36,38,39,40,41,42,43,45,46,47,48,49,50,51,52,53,54,55,56,57,60,61,62,63,64,65,66,67,68,69,70,71,72,73,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,91,92,94,95,97,98,99,100,101,102,103,104,105,106,107,108,110,111,112,113,114,115,116,117,118,119,120,121,122,123,124,125,126,127,128,129,130,131,132,133,135,137,138,139,142,143,144,145,146,147,149,150,151,152,153,154,155,157,158,159,160,161,163,165,168,170,171,172,173,174,175,176,177,178,179,180,181,183,184,185,186,188,189,190,191,192,193,194,195,197,198,199,200,201,202,203,204,206,208,209,210,211,212,214,215,216,217,218,219,220,222,223,224,225,227,229,232,233,234,235,236
interest1
1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,96,97,98,99,100,101,102,103,104,105,106,107,108,109,110,111,112,113,114,115,116,117,118,119,120,121,122
interest2
1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64,65,66,67,68,69,70,71,72,73,74,75,76,78,79,80,81,82
interest5
1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,96,97,98,99,100,101,102,103,104,105,106,107,108,109,110,111,112,113,114,115,116,117,118,119,120,121,122,123,124,125,126,127,128,129,130,131,132,133,134,135,136
kw1
2,3,8,13,17,19,21,28,29,39,41,42,43,46,56,59,65,68,69,70,71,72,74,86,87,88,90,92,95,100,101,105,106,109,111,112,113,119,121,123,125,131,133,136,139,141,142,143,145,150,152,156,157,162,163,166,169,172,173,174,176,177,180,181,183,184,185,186,191,199,203,204,209,211,214,216,230,235,240,242,243,246,249,260,263,265,268,269,271,272,278,279,283,284,289,291,292,295,302,303,304,307,313,317,321,322,323,331,336,341,343,344,351,354,357,358,359,366,367,369,370,372,373,375,376,377,378,380,381,382,390,391,393,396,401,402,406,407,408,409,411,414,417,423,429,433,434,437,438,441,442,443,449,456,464,465,468,472,473,475,477,478,480,482,485,486,487,495,496,497,504,506,507,511,513,521,522,526,532,536,541,542,546,547,560,561,563,566,567,575,576,578,581,584,588,592,594,604,605,610
kw2
2,6,7,9,10,11,12,14,21,22,23,25,26,30,34,38,40,41,42,43,44,46,47,50,55,56,62,63,66,69,70,71,72,74,75,76,77,78,80,81,84,85,87,89,90,91,94,95,100,112,114,116,117,118,119,121,123,124,127,128,129,130,133,135,137,142,143,144,148,149,151,153,154,156,157,158,163,168,171,174,176,177,180,183,184,186,192,193,195,196,197,200,202,203,215,216,217,219,221,223,228,229,235,237,238,240,241,246,248,250,255,258,260,263,266,269,272,275,276,278,280,286,287,290,294,295,296,297,301,302,303,305,313,317,321,323,327,330,333,334,338,339,340,341,343,344,345,347,354,358,359,363,366,368,369,371,374,375,377,378,380,383,384,386,391,393,394,395,398,399,400,403,404,405,408,409,412,413,417,418,422,427,433,436,438,440,442,445,447,448,451,453,454,455,456,457,459,461,462,463
topic1
0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,96,97,98,99,100,101,102,103,104,105,106,107,108,109,110,111,112,113,114,115,116,117,118,119,120,121,122,123,124,125,126,127,128,129,130,131,132,133,134,135,136,137,138,139,140,141,142,143,144,145,146,147,148,149,150,151,152,153,154,155,156,157,158,159,160,161,162,163,164,165,166,167,168,169,170,171,172,173,174,175,176,177,178,179,180,181,182,183,184,185,186,187,188,189,190,191,192,193,194,195,196,197,198,199
topic2
0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,96,97,98,99,100,101,102,103,104,105,106,107,108,109,110,111,112,113,114,115,116,117,118,119,120,121,122,123,124,125,126,127,128,129,130,131,132,133,134,135,136,137,138,139,140,141,142,143,144,145,146,147,148,149,150,151,152,153,154,155,156,157,158,159,160,161,162,163,164,165,166,167,168,169,170,171,172,173,174,175,176,177,178,179,180,181,182,183,184,185,186,187,188,189,190,191,192,193,194,195,196,197,198,199
ct
0,1,2,3,4
os
0,1,2
carrier
0,1,2,3
house
1
interest3
1,2,3,4,5,6,7,8,9,10
kw3
1,7,8,10,15,19,25,27,29,36,50,56,63,68,69,74,77,80,88,93,95,101,117,122,123,124,126,132,133,136,138,149,151,152,153,155,157,164,167,171,173,174,181,186,188,190,192,194,197,198,206,209,213,223,228,233,235,248,249,253,263,273,276,278,280,286,288,295,302,303,311,314,316,323,328,331,332,333,336,343,349,362,364,366,370,372,381,385,391,394,399,401,404,411,412,416,420,425,427,431,453,459,464,465,469,470,474,488,499,504,505,508,512,513,523,530,531,534,539,549,559,560,563,566,568,570,574,581,586,588,598,607,610,617,627,630,633,634,635,636,638,645,650,654,655,657,663,668,676,677,681,685,686,687,691,692,694,695,696,699,701,703,705,707,709,719,722,723,725,734,735,737,739,740,742,745,751,755,763,764,769,771,780,785,788,799,800,805,809,818,821,833,835,836,840,851,853,856,860,862
topic3
1,3,4,8,10,11,14,16,18,19,21,22,23,24,25,27,28,30,31,32,33,34,35,37,39,42,43,44,46,47,49,51,53,54,55,56,58,59,60,62,63,65,66,68,69,70,72,75,76,78,79,81,84,87,88,90,92,95,98,99,100,101,102,103,107,108,109,111,112,113,115,116,117,119,120,121,123,124,126,127,129,130,132,133,136,137,138,139,141,142,143,146,148,150,151,154,157,158,159,161,162,164,165,166,167,168,169,171,174,176,177,178,180,182,183,185,186,187,188,190,191,192,193,194,197,198,199,201,202,205,206,207,209,210,211,212,213,214,215,216,217,218,219,220,221,223,226,227,228,232,233,234,235,237,238,240,241,243,251,252,253,255,256,258,259,260,262,264,265,266,267,268,269,270,271,272,273,274,275,278,279,280,282,283,285,287,288,292,297,298,299,301,304,305,306,307,308,309,312,314
interest4
1,2,3,4,5,6,7,8,9,10
appIdAction
1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,96,97,98,99,100,101,102,103,104,105,106,107,108,109,110,111,112,113,114,115,116,117,118,119,120,121,122,123,124,125,126,127,128,129,130,131,132,133,134,135,136,137,138,139,140,141,142,143,144,145,146,147,148,149,150,151,152,153,154,155,156,157,158,159,160,161,162,163,164,165,166,167,168,169,170,171,172,173,174,175,176,177,178,179,180,181,182,183,184,185,186,187,188,189,190,191,192,193,194,195,196,197,198,199,200
appIdInstall
1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,96,97,98,99,100,101,102,103,104,105,106,107,108,109,110,111,112,113,114,115,116,117,118,119,120,121,122,123,124,125,126,127,128,129,130,131,132,133,134,135,136,137,138,139,140,141,142,143,144,145,146,147,148,149,150,151,152,153,154,155,156,157,158,159,160,161,162,163,164,165,166,167,168,169,170,171,172,173,174,175,176,177,178,179,180,181,182,183,184,185,186,187,188,189,190,191,192,193,194,195,196,197,198,199,200

  

res_f_cut = 'toknowitCUT.txt'
 
# with open(res_f_cut, 'w') as fw:
#     s = '\n'.join([','.join([str(i) for i in l]) for l in cut_l])
#     fw.write(s)
 
# toknowitCUT.txt alternates lines: a feature name, then its (cut) value list
sparse_num_drop_max, is_odd_line = 50, True
with open(res_f_cut, 'r') as fr:
    for i in fr:
        l = i.replace('\n', '').split(',')
        if is_odd_line:
            is_odd_line = False
            k = l[0]
        else:
            is_odd_line = True
            if len(l) <= sparse_num_drop_max:
                # sparse feature: expand each value into a "name_value" sub-feature
                for ii in l:
                    s = '{}_{}'.format(k, ii)
                    print(s)
            else:
                # dense feature: keep a single column
                print(k)
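
A minimal illustration of the rule above, with hypothetical data: features with at most sparse_num_drop_max values are expanded into "name_value" sub-features, denser ones are kept as a single column.

res_d_demo = {'gender': ['0', '1', '2'], 'LBS': [str(i) for i in range(200)]}
sparse_num_drop_max = 50
cols = []
for k, vals in res_d_demo.items():
    if len(vals) <= sparse_num_drop_max:
        cols += ['{}_{}'.format(k, v) for v in vals]  # one sub-feature per value
    else:
        cols.append(k)  # too many values: keep a single column
print(cols)  # ['gender_0', 'gender_1', 'gender_2', 'LBS']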

  

 

age_0
age_1
age_2
age_3
age_4
age_5
gender_0
gender_1
gender_2
marriageStatus_0
marriageStatus_2
marriageStatus_3
marriageStatus_5
marriageStatus_6
marriageStatus_8
marriageStatus_9
marriageStatus_10
marriageStatus_11
marriageStatus_12
marriageStatus_13
marriageStatus_14
marriageStatus_15
education_0
education_1
education_2
education_3
education_4
education_5
education_6
education_7
consumptionAbility_0
consumptionAbility_1
consumptionAbility_2
LBS
interest1
interest2
interest5
kw1
kw2
topic1
topic2
ct_0
ct_1
ct_2
ct_3
ct_4
os_0
os_1
os_2
carrier_0
carrier_1
carrier_2
carrier_3
house_1
interest3_1
interest3_2
interest3_3
interest3_4
interest3_5
interest3_6
interest3_7
interest3_8
interest3_9
interest3_10
kw3
topic3
interest4_1
interest4_2
interest4_3
interest4_4
interest4_5
interest4_6
interest4_7
interest4_8
interest4_9
interest4_10
appIdAction
appIdInstall

  

 

 

 

['uid', 'age', 'gender', 'marriageStatus', 'education', 'consumptionAbility', 'LBS', 'interest1', 'interest2', 'interest3', 'interest4', 'interest5', 'kw1', 'kw2', 'kw3', 'topic1', 'topic2', 'topic3', 'appIdInstall', 'appIdAction', 'ct', 'os', 'carrier', 'house']
 
['age_0', 'age_1', 'age_2', 'age_3', 'age_4', 'age_5', 'gender_0', 'gender_1', 'gender_2', 'marriageStatus_0', 'marriageStatus_2', 'marriageStatus_3', 'marriageStatus_5', 'marriageStatus_6', 'marriageStatus_8', 'marriageStatus_9', 'marriageStatus_10', 'marriageStatus_11', 'marriageStatus_12', 'marriageStatus_13', 'marriageStatus_14', 'marriageStatus_15', 'education_0', 'education_1', 'education_2', 'education_3', 'education_4', 'education_5', 'education_6', 'education_7', 'consumptionAbility_0', 'consumptionAbility_1', 'consumptionAbility_2', 'LBS_0', 'interest1_0', 'interest2_0', 'interest3_1', 'interest3_2', 'interest3_3', 'interest3_4', 'interest3_5', 'interest3_6', 'interest3_7', 'interest3_8', 'interest3_9', 'interest3_10', 'interest4_1', 'interest4_2', 'interest4_3', 'interest4_4', 'interest4_5', 'interest4_6', 'interest4_7', 'interest4_8', 'interest4_9', 'interest4_10', 'interest5_0', 'kw1_0', 'kw2_0', 'kw3_0', 'topic1_0', 'topic2_0', 'topic3_0', 'appIdInstall_0', 'appIdAction_0', 'ct_0', 'ct_1', 'ct_2', 'ct_3', 'ct_4', 'os_0', 'os_1', 'os_2', 'carrier_0', 'carrier_1', 'carrier_2', 'carrier_3', 'house_1']

  

 

f = 'userFeature.data'
'''
17 marriageStatus 11
19 marriageStatus 2 13
20 marriageStatus 13 10
16 marriageStatus 0
21 marriageStatus 2 13 9
22 marriageStatus 12 13 9
23 marriageStatus 12 13 10
 
 11 marriageStatus 11
 5 13 marriageStatus 5 13
 13 10 marriageStatus 13 10
 10 marriageStatus 10
 15 marriageStatus 15
 0 marriageStatus 0
 13 15 marriageStatus 13 15
 12 13 marriageStatus 12 13
 13 marriageStatus 13
 6 13 marriageStatus 6 13
 2 13 marriageStatus 2 13
 13 9 marriageStatus 13 9
 6 13 9 marriageStatus 6 13 9
 2 13 9 marriageStatus 2 13 9
 5 13 9 marriageStatus 5 13 9
 12 13 9 marriageStatus 12 13 9
 14 marriageStatus 14
 12 13 10 marriageStatus 12 13 10
 3 marriageStatus 3
 15 10 marriageStatus 15 10
 8 marriageStatus 8
 6 13 10 marriageStatus 6 13 10
 5 13 10 marriageStatus 5 13 10
 13 10 9 marriageStatus 13 10 9
 13 15 10 marriageStatus 13 15 10
 2 13 10 marriageStatus 2 13 10
  
 marriageStatus 0 2 5 6 8 9 10 11 12 13 15
'''
 
 
def fw_s(f, s):
    with open(f, 'w') as fw:
        fw.write(s)
 
 
# d: for each feature, the values it takes; if a feature takes more than one
#    value, consider splitting it into per-value sub-features
# k_oreder_l: the order of the top-level features
 
 
res_f = 'toknowit.txt'
res_f_k_order = res_f.replace('.', 'KeyOrder.')
res_f_cut, children_val_max = res_f.replace('.', 'Cut.'), 50
 
to_write = True
to_write = False
if to_write:
    d, reduce_chk_counter, reduce_chk_step, k_oreder_l = {}, 0, 500000, []
    with open(f, 'r') as fr:
        for i in fr:
            l = i.replace('\n', '').split('|')
            k_order_l_this = []
            for ii in l:
                ll = ii.split(' ')
                k = ll[0]
                k_order_l_this.append(k)
                if k == 'uid':
                    continue
                if k not in d:
                    d[k] = []
                # convert to int so values sort numerically
                for iii in ll[1:]:
                    d[k].append(int(iii))
            k_oreder_l.append(k_order_l_this)
            reduce_chk_counter += 1
            print(reduce_chk_counter)
            if reduce_chk_counter % reduce_chk_step == 0:
                # reduce_chk_counter = 0
                for k in d:
                    d[k] = list(set(d[k]))
                    k_oreder_l = [e for i, e in enumerate(k_oreder_l) if
                                  k_oreder_l.index(e) == i]  # set() raises TypeError here (lists are unhashable), so dedup by index
 
    for k in d:
        d[k] = sorted(list(set(d[k])), reverse=False)
 
    k_oreder_l = [e for i, e in enumerate(k_oreder_l) if
                  k_oreder_l.index(e) == i]
 
    s = '\n'.join(['{}\n{}'.format(k, ','.join([str(i) for i in d[k]])) for k in d])
    fw_s(res_f, s)
 
    s = '\n'.join(['{}\n{}'.format(k, ','.join([str(i) for i in d[k][0:children_val_max]])) for k in d])
    fw_s(res_f_cut, s)
 
    s = '\n'.join(['|'.join(l) for l in k_oreder_l])
    fw_s(res_f_k_order, s)
 
with open(res_f_k_order, 'r') as fr:
    ori_feature_l = [i.replace('\n', '').split('|') for i in fr]
feature_after_e_d = {}
for l in ori_feature_l:
    for e in l:
        if e not in feature_after_e_d:
            feature_after_e_d[e] = []
        feature_after_e_d[e] += l[l.index(e) + 1:]
        feature_after_e_d[e] = list(set(feature_after_e_d[e]))
 
feature_l = [k for k in sorted(feature_after_e_d, key=lambda e: len(feature_after_e_d[e]), reverse=True)]
print(feature_l)
 
import re
 
# strip the trailing digit so kw1/kw2/kw3 collapse to "kw", topic1/2/3 to "topic", etc.
feature_reduce_l = [i if re.search(r'\d', i) is None else i[0:re.search(r'\d', i).start()] for i in feature_l]
# set() would destroy the ordering, so dedup by first index below instead
print(feature_reduce_l)
print(list(set(feature_reduce_l)))
 
feature_reduce_l = [e for i, e in enumerate(feature_reduce_l) if feature_reduce_l.index(e) == i]
print(feature_reduce_l)
 
sparse_num_drop_max, is_odd_line = 20, True
# attribute values per feature, read back from the cut file
res_d = {}
with open(res_f_cut, 'r') as fr:
    for i in fr:
        l = i.replace('\n', '').split(',')
        if is_odd_line == True:
            is_odd_line = False
            k = l[0]
            res_d[k] = []
        else:
            is_odd_line = True
            if len(l) <= sparse_num_drop_max:
                for ii in l:
                    res_d[k].append(ii)
            else:
                res_d[k].append(0)
feature_expand_l = []
 
feature_l.remove('uid')  # uid is an identifier, not a feature
for k in feature_l:
    feature_expand_l += ['{}_{}'.format(k, i) for i in res_d[k]]
 
print(feature_expand_l)
 
dd = 5  # debugger breakpoint anchor

 

import re, time
 
f = 'userFeature.data'
 
'''
17 marriageStatus 11
19 marriageStatus 2 13
20 marriageStatus 13 10
16 marriageStatus 0
21 marriageStatus 2 13 9
22 marriageStatus 12 13 9
23 marriageStatus 12 13 10
 
 11 marriageStatus 11
 5 13 marriageStatus 5 13
 13 10 marriageStatus 13 10
 10 marriageStatus 10
 15 marriageStatus 15
 0 marriageStatus 0
 13 15 marriageStatus 13 15
 12 13 marriageStatus 12 13
 13 marriageStatus 13
 6 13 marriageStatus 6 13
 2 13 marriageStatus 2 13
 13 9 marriageStatus 13 9
 6 13 9 marriageStatus 6 13 9
 2 13 9 marriageStatus 2 13 9
 5 13 9 marriageStatus 5 13 9
 12 13 9 marriageStatus 12 13 9
 14 marriageStatus 14
 12 13 10 marriageStatus 12 13 10
 3 marriageStatus 3
 15 10 marriageStatus 15 10
 8 marriageStatus 8
 6 13 10 marriageStatus 6 13 10
 5 13 10 marriageStatus 5 13 10
 13 10 9 marriageStatus 13 10 9
 13 15 10 marriageStatus 13 15 10
 2 13 10 marriageStatus 2 13 10
 
 marriageStatus 0 2 5 6 8 9 10 11 12 13 15
'''
 
 
def fw_s(f, s):
    with open(f, 'w') as fw:
        fw.write(s)
 
 
# feature_d: for each feature, the values it takes; if a feature takes more than
#            one value, consider splitting it into per-value sub-features
# feature_order_l: the order of the top-level features
 
# Raw data: feature mining
# per feature: its sub-feature values, how many there are, and the feature order
f_feature = 'toknowit.txt'
f_feature_ele_num = f_feature.replace('.', 'EleNum.')
f_feature_incomplete, f_feature_complete = f_feature.replace('.', 'Incomplete.'), f_feature.replace('.',
                                                                                                    'Complete.')
 
# Raw data processed into production data
# Promote a feature's sub-features to top-level features, capping how many may be
# promoted per original feature; the cap depends on compute budget and the algorithm
f_reduce, f_output = f_feature.replace('.', 'Reduce.'), f_feature.replace('.', 'Output.')
 
# <--- the files generated above take no parameters; their content is fixed
 
 
# this file is the stage-one result and depends on parameters, so it gets a timestamp
 
f_extend = f_feature.replace('.', 'Extend{}.'.format(int(time.time())))
to_write_immutable_file = True
# to_write_immutable_file = False
if to_write_immutable_file:
    feature_d, feature_incomplete_rows_l, reduce_chk_counter, reduce_chk_step, = {}, [], 0, 300000
 
    # scan the multi-GB file once to build two collections: the per-row feature
    # sequences (possibly incomplete) and the feature -> values map
 
    with open(f, 'r') as fr:
        for i in fr:
            l = i.replace('\n', '').split('|')
            feature_incomplete_rows_l_this = []
            for ii in l:
                ll = ii.split(' ')
                k = ll[0]
                feature_incomplete_rows_l_this.append(k)
                if k == 'uid':
                    continue
                if k not in feature_d:
                    feature_d[k] = []
                # convert to int so values sort numerically
                for iii in ll[1:]:
                    feature_d[k].append(int(iii))
            feature_incomplete_rows_l.append(feature_incomplete_rows_l_this)
            reduce_chk_counter += 1
            print(reduce_chk_counter)
            if reduce_chk_counter % reduce_chk_step == 0:
                # reduce_chk_counter = 0  # resetting would save memory; left unreset while testing to watch progress and the total row count
                for k in feature_d:
                    feature_d[k] = list(set(feature_d[k]))
                    feature_incomplete_rows_l = [e for i, e in enumerate(feature_incomplete_rows_l) if
                                                 feature_incomplete_rows_l.index(e) == i]
                # subset TEST: stop after the first batch
                break
 
    for k in feature_d:
        feature_d[k] = sorted(list(set(feature_d[k])), reverse=False)
 
    feature_incomplete_rows_l = [e for i, e in enumerate(feature_incomplete_rows_l) if
                                 feature_incomplete_rows_l.index(e) == i]
 
    s = '\n'.join([','.join(l) for l in feature_incomplete_rows_l])
    fw_s(f_feature_incomplete, s)
 
    feature_after_e_d = {}
    for l in feature_incomplete_rows_l:
        for e in l:
            if e not in feature_after_e_d:
                feature_after_e_d[e] = []
            feature_after_e_d[e] += l[l.index(e) + 1:]
            feature_after_e_d[e] = list(set(feature_after_e_d[e]))
 
    # original top-level features
    feature_complete_l = [k for k in
                          sorted(feature_after_e_d, key=lambda e: len(feature_after_e_d[e]), reverse=True)]
    print(feature_complete_l)
 
    s = '\n'.join(feature_complete_l)
    fw_s(f_feature_complete, s)
 
    print(feature_complete_l)
 
    feature_d_ = {}
    for feature in feature_complete_l:
        if feature == 'uid':
            continue
        feature_d_[feature] = feature_d[feature]
    del feature_d
    feature_d = feature_d_
 
    s = '\n'.join(['{}\n{}'.format(k, ','.join([str(i) for i in feature_d[k]])) for k in feature_d])
    fw_s(f_feature, s)
 
    s = '\n'.join(['{}\n{}'.format(k, len(feature_d[k])) for k in feature_d])
    fw_s(f_feature_ele_num, s)
 
    # raw data persisted <---
 
    # ---> process the raw data into new data
 
    # level-0 features: merge the numbered top-level features, e.g. kw1, kw2, kw3 -> kw
    feature_reduce_l = [i if re.search(r'\d', i) is None else i[0:re.search(r'\d', i).start()] for i in
                        feature_complete_l]
    # set() would destroy the ordering, so dedup by first index below instead
    print(feature_reduce_l)
    print(list(set(feature_reduce_l)))
 
    feature_reduce_l = [e for i, e in enumerate(feature_reduce_l) if feature_reduce_l.index(e) == i]
    print(feature_reduce_l)
    s = '\n'.join(feature_reduce_l)
    fw_s(f_reduce, s)
 
    relative_, absolute_ = 2 / 3, 50
    sparse_num_drop_max = min(
        [absolute_,
         max(sorted([len(feature_d[k]) for k in feature_d], reverse=False)[0:int(len(feature_d) * relative_)])])
 
    s = '\n'.join(
        ['{}\n{}'.format(k, ','.join([str(i) for i in feature_d[k][0:sparse_num_drop_max]])) for k in feature_d])
 
    fw_s(f_output, s)
 
    # attribute values per feature, read back from f_output
    feature_extend_d = {}
    is_odd_line = True
    with open(f_output, 'r') as fr:
        for i in fr:
            l = i.replace('\n', '').split(',')
            if is_odd_line == True:
                is_odd_line = False
                k = l[0]
                feature_extend_d[k] = []
            else:
                is_odd_line = True
                if len(l) <= sparse_num_drop_max:
                    for ii in l:
                        feature_extend_d[k].append(ii)
                else:
                    feature_extend_d[k].append(0)
    feature_extend_l = []
 
    feature_complete_l.pop(feature_complete_l.index('uid'))
 
    feature_extend_l = '|'.join(
        ['|'.join(['{}_{}'.format(k, str(i)) for i in feature_extend_d[k]]) for k in feature_extend_d]).split('|')
 
    print(feature_extend_l)
    s = ','.join(feature_extend_l)
    fw_s(f_extend, s)
 
# generate a reduced dataset to test the Spark join, broadcasting the ad features
 
# feature_extend_l, f_extend = [], 'toknowitExtend1526836898.txt'
#
# with open(f_extend, 'r') as fr:
#     feature_extend_l = [i.replace('\n', '') for i in fr][0].split(',')
 
d = 8  # debugger breakpoint anchor

  

 

age_0,age_1,age_2,age_3,age_4,age_5,gender_0,gender_1,gender_2,marriageStatus_0,marriageStatus_2,marriageStatus_3,marriageStatus_5,marriageStatus_6,marriageStatus_9,marriageStatus_10,marriageStatus_11,marriageStatus_12,marriageStatus_13,marriageStatus_14,marriageStatus_15,education_0,education_1,education_2,education_3,education_4,education_5,education_6,education_7,consumptionAbility_0,consumptionAbility_1,consumptionAbility_2,LBS_0,LBS_1,LBS_4,LBS_6,LBS_7,LBS_8,LBS_9,LBS_13,LBS_14,LBS_15,LBS_16,LBS_18,LBS_19,LBS_21,LBS_25,LBS_27,LBS_29,LBS_32,LBS_33,LBS_35,LBS_38,LBS_39,LBS_41,LBS_43,LBS_45,LBS_46,LBS_47,LBS_48,LBS_49,LBS_52,LBS_54,LBS_56,LBS_57,LBS_61,LBS_62,LBS_63,LBS_64,LBS_66,LBS_69,LBS_71,LBS_72,LBS_73,LBS_75,LBS_77,LBS_78,LBS_81,LBS_83,LBS_84,LBS_85,LBS_86,interest1_1,interest1_2,interest1_3,interest1_4,interest1_5,interest1_6,interest1_7,interest1_8,interest1_9,interest1_10,interest1_11,interest1_12,interest1_13,interest1_14,interest1_15,interest1_16,interest1_17,interest1_18,interest1_19,interest1_20,interest1_21,interest1_22,interest1_23,interest1_24,interest1_25,interest1_26,interest1_27,interest1_28,interest1_29,interest1_30,interest1_31,interest1_32,interest1_33,interest1_34,interest1_35,interest1_36,interest1_37,interest1_38,interest1_39,interest1_40,interest1_41,interest1_42,interest1_43,interest1_44,interest1_45,interest1_46,interest1_47,interest1_48,interest1_49,interest1_50,interest2_1,interest2_2,interest2_3,interest2_4,interest2_5,interest2_6,interest2_7,interest2_8,interest2_9,interest2_10,interest2_11,interest2_12,interest2_13,interest2_14,interest2_15,interest2_16,interest2_17,interest2_18,interest2_19,interest2_20,interest2_21,interest2_22,interest2_23,interest2_24,interest2_25,interest2_26,interest2_27,interest2_28,interest2_29,interest2_30,interest2_31,interest2_32,interest2_33,interest2_35,interest2_36,interest2_37,interest2_38,interest2_39,interest2_40,interest2_41,interest2_42,interest2_43,interest2_44,interest2_45,interest2_46,interest2_47,interest2_48,interest2_49,interest2_50,interest2_51,interest3_1,interest3_2,interest3_3,interest3_4,interest3_5,interest3_6,interest3_7,interest3_8,interest3_9,interest3_10,interest4_1,interest4_2,interest4_3,interest4_4,interest4_5,interest4_6,interest4_7,interest4_8,interest4_9,interest4_10,interest5_1,interest5_2,interest5_3,interest5_4,interest5_5,interest5_6,interest5_7,interest5_8,interest5_9,interest5_10,interest5_11,interest5_12,interest5_13,interest5_14,interest5_15,interest5_16,interest5_17,interest5_18,interest5_19,interest5_20,interest5_21,interest5_22,interest5_23,interest5_24,interest5_25,interest5_26,interest5_27,interest5_28,interest5_29,interest5_30,interest5_31,interest5_32,interest5_33,interest5_34,interest5_35,interest5_36,interest5_37,interest5_38,interest5_39,interest5_40,interest5_41,interest5_42,interest5_43,interest5_44,interest5_45,interest5_46,interest5_47,interest5_48,interest5_49,interest5_50,kw1_13,kw1_19,kw1_28,kw1_69,kw1_70,kw1_72,kw1_87,kw1_92,kw1_105,kw1_106,kw1_109,kw1_119,kw1_121,kw1_123,kw1_133,kw1_136,kw1_145,kw1_152,kw1_157,kw1_163,kw1_169,kw1_176,kw1_177,kw1_180,kw1_181,kw1_191,kw1_209,kw1_235,kw1_242,kw1_249,kw1_278,kw1_279,kw1_289,kw1_295,kw1_313,kw1_317,kw1_321,kw1_336,kw1_341,kw1_344,kw1_354,kw1_358,kw1_366,kw1_367,kw1_370,kw1_376,kw1_378,kw1_380,kw1_382,kw1_391,kw2_2,kw2_10,kw2_11,kw2_34,kw2_46,kw2_47,kw2_50,kw2_55,kw2_62,kw2_63,kw2_69,kw2_70,kw2_76,kw2_87,kw2_91,kw2_100,kw2_114,kw2_116,kw2_117,kw2_123,kw2_124,kw2_127,kw2_129,kw2_135,kw2_137,kw2_142,kw2_144,kw2_151,kw2_158,kw2_163,kw2_168,kw2_174,kw2_177,kw2_180,kw2_184,kw2_192,kw2_196,kw2_197,kw2_200,kw2_202,kw2_215,kw2_216,kw2_217,kw2_223,kw2_235,kw2_237,kw2_240,kw2_241,kw2_246,kw2_250,kw3_7,kw3_27,kw3_29,kw3_68,kw3_80,kw3_88,kw3_95,kw3_101,kw3_138,kw3_171,kw3_186,kw3_197,kw3_198,kw3_206,kw3_213,kw3_223,kw3_248,kw3_263,kw3_273,kw3_302,kw3_316,kw3_336,kw3_349,kw3_362,kw3_381,kw3_401,kw3_412,kw3_416,kw3_453,kw3_465,kw3_470,kw3_488,kw3_513,kw3_534,kw3_549,kw3_560,kw3_570,kw3_581,kw3_586,kw3_598,kw3_610,kw3_627,kw3_633,kw3_638,kw3_668,kw3_685,kw3_692,kw3_694,kw3_695,kw3_701,topic1_0,topic1_1,topic1_2,topic1_3,topic1_4,topic1_5,topic1_6,topic1_7,topic1_9,topic1_10,topic1_11,topic1_12,topic1_13,topic1_14,topic1_15,topic1_16,topic1_17,topic1_18,topic1_19,topic1_20,topic1_21,topic1_22,topic1_23,topic1_24,topic1_25,topic1_26,topic1_27,topic1_28,topic1_29,topic1_30,topic1_31,topic1_32,topic1_33,topic1_34,topic1_35,topic1_36,topic1_37,topic1_38,topic1_39,topic1_40,topic1_41,topic1_42,topic1_43,topic1_44,topic1_45,topic1_46,topic1_47,topic1_48,topic1_49,topic1_50,topic2_0,topic2_2,topic2_3,topic2_4,topic2_5,topic2_6,topic2_7,topic2_9,topic2_10,topic2_11,topic2_13,topic2_14,topic2_15,topic2_16,topic2_17,topic2_19,topic2_20,topic2_21,topic2_22,topic2_24,topic2_25,topic2_26,topic2_27,topic2_28,topic2_29,topic2_30,topic2_31,topic2_32,topic2_33,topic2_34,topic2_35,topic2_36,topic2_39,topic2_40,topic2_41,topic2_42,topic2_43,topic2_44,topic2_45,topic2_46,topic2_47,topic2_48,topic2_49,topic2_50,topic2_51,topic2_52,topic2_53,topic2_54,topic2_55,topic2_56,topic3_3,topic3_10,topic3_11,topic3_14,topic3_18,topic3_24,topic3_28,topic3_30,topic3_31,topic3_33,topic3_39,topic3_42,topic3_43,topic3_47,topic3_53,topic3_55,topic3_56,topic3_58,topic3_59,topic3_60,topic3_62,topic3_66,topic3_68,topic3_70,topic3_72,topic3_76,topic3_78,topic3_79,topic3_81,topic3_84,topic3_87,topic3_90,topic3_92,topic3_99,topic3_100,topic3_101,topic3_109,topic3_111,topic3_112,topic3_119,topic3_121,topic3_123,topic3_124,topic3_127,topic3_130,topic3_136,topic3_137,topic3_138,topic3_139,topic3_141,appIdInstall_1,appIdInstall_4,appIdInstall_6,appIdInstall_9,appIdInstall_10,appIdInstall_11,appIdInstall_12,appIdInstall_15,appIdInstall_16,appIdInstall_17,appIdInstall_19,appIdInstall_21,appIdInstall_23,appIdInstall_26,appIdInstall_27,appIdInstall_28,appIdInstall_29,appIdInstall_32,appIdInstall_34,appIdInstall_35,appIdInstall_39,appIdInstall_40,appIdInstall_41,appIdInstall_42,appIdInstall_43,appIdInstall_44,appIdInstall_45,appIdInstall_47,appIdInstall_48,appIdInstall_49,appIdInstall_51,appIdInstall_52,appIdInstall_55,appIdInstall_56,appIdInstall_57,appIdInstall_58,appIdInstall_60,appIdInstall_61,appIdInstall_62,appIdInstall_63,appIdInstall_65,appIdInstall_67,appIdInstall_68,appIdInstall_69,appIdInstall_70,appIdInstall_71,appIdInstall_73,appIdInstall_74,appIdInstall_76,appIdInstall_77,appIdAction_2,appIdAction_4,appIdAction_5,appIdAction_7,appIdAction_8,appIdAction_11,appIdAction_13,appIdAction_14,appIdAction_16,appIdAction_17,appIdAction_27,appIdAction_30,appIdAction_32,appIdAction_33,appIdAction_34,appIdAction_35,appIdAction_36,appIdAction_37,appIdAction_38,appIdAction_39,appIdAction_40,appIdAction_41,appIdAction_43,appIdAction_44,appIdAction_45,appIdAction_47,appIdAction_50,appIdAction_51,appIdAction_52,appIdAction_53,appIdAction_55,appIdAction_56,appIdAction_60,appIdAction_62,appIdAction_65,appIdAction_66,appIdAction_69,appIdAction_70,appIdAction_71,appIdAction_72,appIdAction_74,appIdAction_75,appIdAction_76,appIdAction_77,appIdAction_80,appIdAction_81,appIdAction_83,appIdAction_84,appIdAction_85,appIdAction_91,ct_0,ct_1,ct_2,ct_3,ct_4,os_0,os_1,os_2,carrier_0,carrier_1,carrier_2,carrier_3,house_1

  

 

Moving the JOIN operation to Spark
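
Before the full listing, a minimal sketch of what such a broadcast join could look like in PySpark. The names train.csv and adFeature.csv come from the code below; userFeatureExtend.csv and the output path are illustrative stand-ins for the expanded user-feature file generated above:

from pyspark.sql import SparkSession
from pyspark.sql.functions import broadcast

spark = SparkSession.builder.appName('gen-dnn-data').getOrCreate()

train = spark.read.csv('train.csv', header=True)             # aid, uid, label
ad = spark.read.csv('adFeature.csv', header=True)            # aid + ad features (small)
user = spark.read.csv('userFeatureExtend.csv', header=True)  # uid + expanded 0/1 columns (large)

# the ad table is small, so ship a copy to every executor instead of shuffling it
joined = train.join(broadcast(ad), 'aid').join(user, 'uid')
joined.write.csv('dnn_train_csv', header=True)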

 

 

import re, time

f = 'userFeature.data'

'''
17 marriageStatus 11
19 marriageStatus 2 13
20 marriageStatus 13 10
16 marriageStatus 0
21 marriageStatus 2 13 9
22 marriageStatus 12 13 9
23 marriageStatus 12 13 10

 11 marriageStatus 11
 5 13 marriageStatus 5 13
 13 10 marriageStatus 13 10
 10 marriageStatus 10
 15 marriageStatus 15
 0 marriageStatus 0
 13 15 marriageStatus 13 15
 12 13 marriageStatus 12 13
 13 marriageStatus 13
 6 13 marriageStatus 6 13
 2 13 marriageStatus 2 13
 13 9 marriageStatus 13 9
 6 13 9 marriageStatus 6 13 9
 2 13 9 marriageStatus 2 13 9
 5 13 9 marriageStatus 5 13 9
 12 13 9 marriageStatus 12 13 9
 14 marriageStatus 14
 12 13 10 marriageStatus 12 13 10
 3 marriageStatus 3
 15 10 marriageStatus 15 10
 8 marriageStatus 8
 6 13 10 marriageStatus 6 13 10
 5 13 10 marriageStatus 5 13 10
 13 10 9 marriageStatus 13 10 9
 13 15 10 marriageStatus 13 15 10
 2 13 10 marriageStatus 2 13 10

 marriageStatus 0 2 5 6 8 9 10 11 12 13 15
'''


def fw_s(f, s):
    with open(f, 'w') as fw:
        fw.write(s)


# feature_d: for each feature, the values it takes; if a feature takes more than
#            one value, consider splitting it into per-value sub-features
# feature_order_l: the order of the top-level features

# Raw data: feature mining
# per feature: its sub-feature values, how many there are, and the feature order
f_feature = 'toknowit.txt'
f_feature_ele_num = f_feature.replace('.', 'EleNum.')
f_feature_incomplete, f_feature_complete = f_feature.replace('.', 'Incomplete.'), f_feature.replace('.',
                                                                                                    'Complete.')

# Raw data processed into production data
# Promote a feature's sub-features to top-level features, capping how many may be
# promoted per original feature; the cap depends on compute budget and the algorithm
f_reduce, f_output = f_feature.replace('.', 'Reduce.'), f_feature.replace('.', 'Output.')

# <--- the files generated above take no parameters; their content is fixed


# this file is the stage-one result and depends on parameters, so it gets a timestamp

f_extend = f_feature.replace('.', 'Extend{}.'.format(int(time.time())))
to_write_immutable_file = True
# to_write_immutable_file = False
if to_write_immutable_file:
    feature_d, feature_incomplete_rows_l, reduce_chk_counter, reduce_chk_step, = {}, [], 0, 300000

    # scan the multi-GB file once to build two collections: the per-row feature
    # sequences (possibly incomplete) and the feature -> values map

    with open(f, 'r') as fr:
        for i in fr:
            l = i.replace('\n', '').split('|')
            feature_incomplete_rows_l_this = []
            for ii in l:
                ll = ii.split(' ')
                k = ll[0]
                feature_incomplete_rows_l_this.append(k)
                if k == 'uid':
                    continue
                if k not in feature_d:
                    feature_d[k] = []
                # convert to int so values sort numerically
                for iii in ll[1:]:
                    feature_d[k].append(int(iii))
            feature_incomplete_rows_l.append(feature_incomplete_rows_l_this)
            reduce_chk_counter += 1
            print(reduce_chk_counter)
            if reduce_chk_counter % reduce_chk_step == 0:
                # reduce_chk_counter = 0  # resetting would save memory; left unreset while testing to watch progress and the total row count
                for k in feature_d:
                    feature_d[k] = list(set(feature_d[k]))
                    feature_incomplete_rows_l = [e for i, e in enumerate(feature_incomplete_rows_l) if
                                                 feature_incomplete_rows_l.index(e) == i]
                    # subset TEST (uncomment to stop after the first batch)
                    # break

    for k in feature_d:
        feature_d[k] = sorted(list(set(feature_d[k])), reverse=False)

    feature_incomplete_rows_l = [e for i, e in enumerate(feature_incomplete_rows_l) if
                                 feature_incomplete_rows_l.index(e) == i]

    s = '\n'.join([','.join(l) for l in feature_incomplete_rows_l])
    fw_s(f_feature_incomplete, s)

    feature_after_e_d = {}
    for l in feature_incomplete_rows_l:
        for e in l:
            if e not in feature_after_e_d:
                feature_after_e_d[e] = []
            feature_after_e_d[e] += l[l.index(e) + 1:]
            feature_after_e_d[e] = list(set(feature_after_e_d[e]))

    # original top-level features
    feature_complete_l = [k for k in
                          sorted(feature_after_e_d, key=lambda e: len(feature_after_e_d[e]), reverse=True)]
    print(feature_complete_l)

    s = '\n'.join(feature_complete_l)
    fw_s(f_feature_complete, s)

    print(feature_complete_l)

    feature_d_ = {}
    for feature in feature_complete_l:
        if feature == 'uid':
            continue
        feature_d_[feature] = feature_d[feature]
    del feature_d
    feature_d = feature_d_

    s = '\n'.join(['{}\n{}'.format(k, ','.join([str(i) for i in feature_d[k]])) for k in feature_d])
    fw_s(f_feature, s)

    s = '\n'.join(['{}\n{}'.format(k, len(feature_d[k])) for k in feature_d])
    fw_s(f_feature_ele_num, s)

    # raw data persisted <---

    # ---> process the raw data into new data

    # level-0 features: merge the numbered top-level features, e.g. kw1, kw2, kw3 -> kw
    feature_reduce_l = [i if re.search(r'\d', i) is None else i[0:re.search(r'\d', i).start()] for i in
                        feature_complete_l]
    # set() would destroy the ordering, so dedup by first index below instead
    print(feature_reduce_l)
    print(list(set(feature_reduce_l)))

    feature_reduce_l = [e for i, e in enumerate(feature_reduce_l) if feature_reduce_l.index(e) == i]
    print(feature_reduce_l)
    s = '\n'.join(feature_reduce_l)
    fw_s(f_reduce, s)

    relative_, absolute_ = 2 / 3, 50
    sparse_num_drop_max = min(
        [absolute_,
         max(sorted([len(feature_d[k]) for k in feature_d], reverse=False)[0:int(len(feature_d) * relative_)])])

    s = '\n'.join(
        ['{}\n{}'.format(k, ','.join([str(i) for i in feature_d[k][0:sparse_num_drop_max]])) for k in feature_d])

    fw_s(f_output, s)

    # attribute values per feature, read back from f_output
    feature_extend_d = {}
    is_odd_line = True
    with open(f_output, 'r') as fr:
        for i in fr:
            l = i.replace('\n', '').split(',')
            if is_odd_line == True:
                is_odd_line = False
                k = l[0]
                feature_extend_d[k] = []
            else:
                is_odd_line = True
                if len(l) <= sparse_num_drop_max:
                    for ii in l:
                        feature_extend_d[k].append(ii)
                else:
                    feature_extend_d[k].append(0)
    feature_extend_l = []

    feature_complete_l.pop(feature_complete_l.index('uid'))

    feature_extend_l = '|'.join(
        ['|'.join(['{}_{}'.format(k, str(i)) for i in feature_extend_d[k]]) for k in feature_extend_d]).split('|')

    print(feature_extend_l)
    s = ','.join(feature_extend_l)
    fw_s(f_extend, s)

# generate a reduced dataset to test the Spark join, broadcasting the ad features
ori_l, extend_l = [], []
with open('toknowitComplete.txt', 'r')as fr:
    ori_l = [i.replace('\n', '') for i in fr]

feature_extend_l, f_feature_extend = [], f_extend
with open(f_feature_extend, 'r')as fr:
    extend_l = [i.replace('\n', '').split(',') for i in fr][0]

ori_extend_d = {}

# map each original feature name to its expanded sub-feature columns via substring
# matching (note: "age" is a substring of "marriageStatus", so this produces false
# matches that are removed just below)
for ori in ori_l:
    for extend_ in extend_l:
        if ori in extend_:
            if ori not in ori_extend_d:
                ori_extend_d[ori] = {}
            ori_extend_d[ori][extend_] = 0
import copy

# remove the false matches: marriageStatus sub-features also contain the substring "age"
ori_extend_d_ = copy.deepcopy(ori_extend_d)
for i in ori_extend_d_['age']:
    if 'marriageStatus' in i:
        del ori_extend_d['age'][i]
del ori_extend_d_

'''
1 - build the row template (meta-structure) with every leaf value set to 0

2 - for each data row, update the leaf values

'''
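# Illustration with a hypothetical two-feature template:
#   ori_extend_d = {'gender': {'gender_0': 0, 'gender_1': 0}, 'house': {'house_1': 0}}
# for the row "uid 1|gender 1|house 1", the per-row deep copy becomes
#   {'gender': {'gender_0': 0, 'gender_1': 1}, 'house': {'house_1': 1}}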
c_ = 0
rows_d_l = []
with open(f, 'r') as fr:
    for i in fr:
        # c_ += 1
        # if c_ == 6:
        #     break

        ori_row_l = i.replace('\n', '').split('|')
        ori_extend_d_this = copy.deepcopy(ori_extend_d)
        uid_d = {}
        for ii in ori_row_l:
            l = ii.split(' ')
            print(l)
            feature_ori, val_l = l[0], l[1:]
            if feature_ori == 'uid':
                uid = val_l[0]
                continue
            if len(ori_extend_d[feature_ori]) == 1:
                # dense feature collapsed to a single column: just mark presence
                for feature_sub in ori_extend_d_this[feature_ori]:
                    print(feature_sub)
                    ori_extend_d_this[feature_ori][feature_sub] = 1 if int(val_l[0]) > 0 else 0
            else:
                for val_ in val_l:
                    feature_sub = '{}_{}'.format(feature_ori, val_)
                    print(feature_sub)
                    if feature_sub in ori_extend_d_this[feature_ori]:  # needed: the template was cut, so unseen values must be skipped
                        ori_extend_d_this[feature_ori][feature_sub] = 1
        uid_d[uid] = ori_extend_d_this
        del ori_extend_d_this
        rows_d_l.append(uid_d)
        del uid_d

s_l = []
f_userdata_extend = f.replace('.data', '{}.data'.format(int(time.time())))
for d in rows_d_l:
    for uid in d:
        c_ += 1
        l = []
        d_ = d[uid]
        for feature_ in d_:
            for feature_sub in d_[feature_]:
                l.append(d_[feature_][feature_sub])
        s = '{},{}'.format(uid, ','.join([str(i) for i in l]))
        s_l.append(s)
fw_s(f_userdata_extend, '\n'.join(s_l))
print(c_)

'''
gen JOIN data FOR DNN

'''
f_user = 'userFeature.data'
f_ad = 'adFeature.csv'
f_user_extend = f_userdata_extend
f_train = 'train.csv'
f_test = 'test2.csv'

'''
gen head
'''
csv_head = 'advertiserId,campaignId,creativeId,creativeSize,adCategoryId,productId,productType'
f_toknowitExtend = 'toknowitExtend1527038949.txt'
try:
    with open(f_toknowitExtend, 'r') as fr:
        for i in fr:
            csv_head = 'label,{},{}'.format(i.replace('\n', ''), csv_head)
    print(csv_head)
except Exception as e:
    print(e)
    csv_head = ''  # no  file
'''
get dict
'''

ad_d = {}
with open(f_ad, 'r') as fr:
    for i in fr:
        if 'aid' in i:
            continue
        l = i.replace('\n', '').split(',')
        aid = l[0]
        ad_d[aid] = ','.join(l[1:])

uid_d = {}
with open(f_user_extend, 'r') as fr:
    for i in fr:
        if 'aid' in i:
            continue
        l = i.replace('\n', '').split(',')
        uid = l[0]
        uid_d[uid] = ','.join(l[1:])
'''
gen train data
'''
dnn_csvTRAIN = 'dnn_csvTRAIN{}.csv'.format(int(time.time()))
with open(dnn_csvTRAIN, 'w') as fa:
    fa.write(csv_head + '\n')  # header row; the newline keeps data rows off the header line
with open(f_train, 'r') as fr:
    for i in fr:
        if 'aid' in i:
            continue
        try:
            l = i.replace('\n', '').replace(' ', '').split(',')
            print(l)
            aid, uid, label = l
            s = '{},{},{}\n'.format(label, uid_d[uid], ad_d[aid])
            with open(dnn_csvTRAIN, 'a') as fa:
                fa.write(s)
        except Exception as e:
            print(e)

'''
gen test data
'''

dnn_csvTEST = 'dnn_csvTEST{}.csv'.format(int(time.time()))
with open(dnn_csvTEST, 'w') as fa:
    fa.write(csv_head + '\n')  # header row
with open(f_test, 'r') as fr:
    for i in fr:
        if 'aid' in i:
            continue
        try:
            break  # disabled: remove this line to actually generate the test rows
            l = i.replace('\n', '').replace(' ', '').split(',')
            print(l)
            # aid, uid, label = l
            aid, uid = l
            label = 0
            s = '{},{},{}\n'.format(label, uid_d[uid], ad_d[aid])
            with open(dnn_csvTEST, 'a') as fa:
                fa.write(s)
        except Exception as e:
            print(e)

dd = 9  # debugger breakpoint anchor

 
