Keyword Library Cleaning System
A keyword library cleaning pipeline implemented in Python. There is a lot packed into these six scripts, so take your time with them: they cover many practical Python idioms, patterns, and ways of structuring a task, and if you can generalize from them you already have a solid foundation for building your own SEO tooling. The point is to stop depending entirely on engineers, and to go both wider and deeper in SEO thinking and SEO technique.
1. Create first.py
# coding:utf-8
'''
Step 1: filter the Fengchao (Baidu Phoenix Nest) keyword export. Drop
keywords that contain spaces, keep only those with searches > 5 that
contain the root word, and write the result to fcword.txt.
Pre-process the Fengchao expansion as needed (drop words with no relation
to the root), keeping just the two columns "word,searches".
inputfile == Fengchao keyword CSV
cigen    == word the keywords must contain, usually the channel name going live
'''
import csv, sys

inputfile, cigen = sys.argv[1:3]
csv.field_size_limit(sys.maxsize)
reader = csv.reader(open(inputfile, 'rb'))
csvfile = open('fcword.txt', 'wb')
writer = csv.writer(csvfile, dialect='excel')
word_set = set()   # (word, searches) tuples; a tuple set dedupes safely even if a word contains '-'

print ">>> Filtering Fengchao keywords..."
for line in reader:   # read each Fengchao keyword and its search volume
    try:
        word, searches = line[0], line[1]
        # drop words containing a space or '的'; keep words that contain the
        # root word and have a search volume above 5
        if ' ' not in word and '的' not in word and cigen in word and int(searches) > 5:
            word_set.add((word, searches))
    except (IndexError, ValueError):   # skip short or malformed rows
        continue

print ">>> Filtering done"
print ">>> Writing results to fcword.txt..."
for word, searches in word_set:   # write the deduped keywords to fcword.txt
    writer.writerow([word, searches])
print ">>> fcword.txt written"
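As a quick sanity check of the filter rule, here is a minimal sketch on made-up rows (the keywords and search volumes are hypothetical; '网络营销' stands in for the root word):

# Minimal check of the step-1 filter on made-up rows (hypothetical data)
rows = [(u'网络营销策划书', '120'),   # kept
        (u'网络 营销', '80'),         # dropped: contains a space
        (u'网络营销的误区', '60'),    # dropped: contains '的'
        (u'网络营销案例', '3')]       # dropped: searches <= 5
cigen = u'网络营销'
kept = [(w, s) for w, s in rows
        if ' ' not in w and u'的' not in w and cigen in w and int(s) > 5]
print kept   # [(u'网络营销策划书', '120')]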
2. Create two.py
#coding:utf-8
'''
Step 2: pull the live keywords and their URLs from the CMS database. Use
the same root word as the Fengchao expansion in step 1.
'''
import MySQLdb, sys, csv

reload(sys)                        # reload sys to expose setdefaultencoding
sys.setdefaultencoding("utf-8")    # avoid "ascii codec" errors on Chinese text

cigen, py = sys.argv[1:3]

# open the database connection (fill in your own credentials)
db = MySQLdb.connect("{ip}", "{user}", "{password}", "{database}", charset="utf8")
cursor = db.cursor()

# bug fix: the matching scope now covers all channels
queries = [
    ('select title,url from v9_zhuanti',    'sqldata1.csv', 'topic'),
    ('select catname,url from v9_category', 'sqldata2.csv', 'category'),
    ('select title,url from v9_news',       'sqldata3.csv', 'detail'),
]

print ">>> Fetching %s channel keyword data from MySQL..." % cigen
counts = {}
for sql, outfile, label in queries:
    print ">>> Fetching %s keywords..." % label
    writer = csv.writer(open(outfile, 'wb'), dialect='excel')
    cursor.execute(sql)
    counts[label] = 0
    for name, url in cursor.fetchall():
        writer.writerow([name, 'http://www.domain.com/%s' % url])
        counts[label] += 1

print ">>> Categories fetched: %s" % counts['category']
print ">>> Topics fetched: %s" % counts['topic']
print ">>> Details fetched: %s" % counts['detail']

db.close()   # close the database connection
print ">>> MySQL connection closed"
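The queries above fetch every row and leave the filtering to Python. If you would rather have MySQL filter by the root word, a parameterized LIKE query is one option; this is a sketch under the assumption of the same v9_news table and MySQLdb connection:

# Hypothetical variant: let MySQL filter titles by the root word.
# MySQLdb escapes the parameter, so the root is interpolated safely.
sql = "select title,url from v9_news where title like %s"
cursor.execute(sql, ('%' + cigen + '%',))
for name, url in cursor.fetchall():
    print name, url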
3. Create three.py
#coding:utf-8
'''
Step 3: compare the Fengchao keywords against the live keywords. Exact
matches go to pipei_word.txt (deduped in step 4); unmatched words go to
nopipei_word.txt, to be segmented and relevance-checked later to pick a
suitable landing page.
'''
import csv, os, sys

inputfile, cigen, py = sys.argv[1:4]
os.system("python first.py %s %s" % (inputfile, cigen))   # step 1
os.system("python two.py %s %s" % (cigen, py))            # step 2
os.system("cat sqldata*.csv > hebing.csv")                # merge the three exports
os.system("rm sqldata*.csv")

csv.field_size_limit(sys.maxsize)
f = open('fcword.txt', 'r')
p = open('pipei_word.txt', 'w')     # exact matches
np = open('nopipei_word.txt', 'w')  # unmatched words

# index the live keywords once: word -> list of URLs. A word may be live on
# several pages; step 4 keeps the one with the longest article body.
live = {}
for line in csv.reader(open('hebing.csv', 'rb')):
    live.setdefault(line[0], []).append(line[1])

print ">>> Comparing Fengchao keywords against live keywords..."
n = 0   # matched
w = 0   # unmatched
for term in f:   # read the Fengchao keywords
    term = term.strip()
    word, searches = term.split(',')[0], term.split(',')[1]
    if word in live:   # exact matches go to pipei_word.txt
        for url in live[word]:
            p.write("%s,%s,%s\n" % (word, searches, url))
        n += 1
    else:              # unmatched words go to nopipei_word.txt
        np.write("%s,%s\n" % (word, searches))
        w += 1

os.system("rm hebing.csv")
print ">>> Matching done"
print ">>> Exact matches: %s" % n
print ">>> Unmatched: %s" % w
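Indexing hebing.csv into a dict up front is what keeps step 3 fast: re-reading the CSV once per keyword, as a naive loop would, costs O(keywords x live pages), while the dict gives each lookup constant time. A toy check of the index building, on made-up rows:

# Toy check of the word -> URLs index (hypothetical rows)
live = {}
rows = [(u'网络营销策划书', 'http://www.domain.com/a/1.html'),
        (u'网络营销策划书', 'http://www.domain.com/a/2.html')]
for name, url in rows:
    live.setdefault(name, []).append(url)
print live[u'网络营销策划书']   # both URLs kept; step 4 picks the longer page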
4. Create four.py
#coding:utf-8
'''
Step 4: dedupe the exact matches in pipei_word.txt. When one word matched
several live pages, fetch each page and keep the URL whose article body
holds the most characters.
'''
import sys, os, pycurl, StringIO, random, re, threading
from bs4 import BeautifulSoup as bs

inputfile, cigen, py = sys.argv[1:4]
os.system("python three.py %s %s %s" % (inputfile, cigen, py))   # step 3
f = open('pipei_word.txt', 'r')
fd = open('pipei_word_1.txt', 'w')

def getUA():
    # rotate through a pool of User-Agent strings
    uaList = [
        'Mozilla/4.0+(compatible;+MSIE+6.0;+Windows+NT+5.1;+SV1;+.NET+CLR+1.1.4322;+TencentTraveler)',
        'Mozilla/4.0+(compatible;+MSIE+6.0;+Windows+NT+5.1;+SV1;+.NET+CLR+2.0.50727;+.NET+CLR+3.0.4506.2152;+.NET+CLR+3.5.30729)',
        'Mozilla/5.0+(Windows+NT+5.1)+AppleWebKit/537.1+(KHTML,+like+Gecko)+Chrome/21.0.1180.89+Safari/537.1',
        'Mozilla/4.0+(compatible;+MSIE+6.0;+Windows+NT+5.1;+SV1)',
        'Mozilla/5.0+(Windows+NT+6.1;+rv:11.0)+Gecko/20100101+Firefox/11.0',
        'Mozilla/4.0+(compatible;+MSIE+8.0;+Windows+NT+5.1;+Trident/4.0;+SV1)',
        'Mozilla/4.0+(compatible;+MSIE+8.0;+Windows+NT+5.1;+Trident/4.0;+GTB7.1;+.NET+CLR+2.0.50727)',
        'Mozilla/4.0+(compatible;+MSIE+8.0;+Windows+NT+5.1;+Trident/4.0;+KB974489)',
        'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/30.0.1599.101 Safari/537.36',
        'Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/30.0.1599.101 Safari/537.36'
    ]
    return random.choice(uaList)

headers = [
    "Accept:text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
    "Accept-Encoding:gzip, deflate, sdch",
    "Accept-Language:zh-CN,zh;q=0.8,en;q=0.6",
    "Connection:keep-alive",
    "Host:www.domain.com",
    "RA-Sid:7739A016-20140918-030243-3adabf-48f828",
    "RA-Ver:2.8.9",
    "User-Agent:%s" % getUA()
]

def getHtml(url, headers):
    # fetch a page with pycurl, retrying up to 10 times
    x = 0
    while x < 10:
        x += 1
        try:
            c = pycurl.Curl()
            c.setopt(pycurl.MAXREDIRS, 5)
            c.setopt(pycurl.REFERER, url)
            c.setopt(pycurl.FOLLOWLOCATION, True)
            c.setopt(pycurl.CONNECTTIMEOUT, 60)
            c.setopt(pycurl.TIMEOUT, 120)
            c.setopt(pycurl.ENCODING, 'gzip,deflate')
            #c.setopt(c.PROXY, ip)          # enable to crawl through a proxy
            c.fp = StringIO.StringIO()
            c.setopt(pycurl.URL, url)
            c.setopt(pycurl.HTTPHEADER, headers)
            c.setopt(c.WRITEFUNCTION, c.fp.write)
            c.perform()
            #code = c.getinfo(c.HTTP_CODE)  # HTTP status code, if needed
            return c.fp.getvalue()
        except:
            print "Request failed, retrying >>>"
            continue
    return ''   # give up after 10 failed attempts

url_list = []
for line in f:
    url_list.append(line.strip())

print ">>> Deduping words matched by several pages, keeping the URL with the longest body..."

# crawler thread: append the article body length to every line it handles
class getPic(threading.Thread):
    def __init__(self, url_list):
        threading.Thread.__init__(self)
        self.url_list = url_list

    def downloadimg(self):
        for line in self.url_list:
            url = line.split(',')[2]
            html = getHtml(url, headers)
            if '<div id="Article">' in html:
                # strip the tags out of the article div and count the characters
                number = len(re.sub('<[^>]*?>', '', str(bs(html).find('div', {'id': 'Article'}))))
            else:
                number = 0
            fd.write('%s,%s\n' % (line, number))

    def run(self):
        self.downloadimg()

if __name__ == "__main__":
    # split url_list into 5 chunks and run one thread per chunk
    getPicThreads = []
    chunk = (len(url_list) + 4) / 5
    for i in range(5):
        getPicThreads.append(getPic(url_list[chunk * i:chunk * (i + 1)]))
    for t in getPicThreads:
        t.start()
    for t in getPicThreads:
        t.join()
    fd.close()

    # keep only the line with the largest body length for each word
    best = {}
    for line in open('pipei_word_1.txt'):
        line = line.strip()
        head, count = line.rsplit(',', 1)
        word = head.split(',')[0]
        if word not in best or int(count) > best[word][0]:
            best[word] = (int(count), line)
    out = open('pipei_word_1.txt', 'w')
    for count, line in best.values():
        out.write('%s\n' % line)
    out.close()

    print ">>> Done"
    print ">>> Replacing the working file"
    os.system("rm pipei_word.txt")
    os.system("mv pipei_word_1.txt pipei_word.txt")
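The body-length measurement is just "strip the tags inside <div id="Article">, count what is left". A minimal check on a made-up HTML string:

# Minimal check of the body-length measurement (made-up markup)
import re
from bs4 import BeautifulSoup as bs
html = '<html><div id="Article"><p>hello</p><p>world</p></div></html>'
text = re.sub('<[^>]*?>', '', str(bs(html).find('div', {'id': 'Article'})))
print text, len(text)   # helloworld 10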
5. Create five.py
#coding:utf-8
'''
Step 5: run the remaining unmatched Fengchao words through the site's own
search. For each word, extract the search result count, the whole-word
recall and the main-word recall, then use those metrics to decide whether
the word deserves a new topic (list) page or should be matched to a
similar detail page title.
ps: the number of new topic pages should be capped.
'''
import sys, os, time, pycurl, StringIO, random, re, threading, urllib
from bs4 import BeautifulSoup as bs

cigen, py = sys.argv[1:3]
f = open('nopipei_word.txt', 'r')
zt = open('新增列表词.txt', 'w')      # words that get a new topic/list page
wjg = open('无结果词.txt', 'w')       # words with no usable search results
xgt = open('detail匹配词.txt', 'w')   # words matched to an existing detail page

def getUA():
    # rotate through the same User-Agent pool as step 4
    uaList = [
        'Mozilla/4.0+(compatible;+MSIE+6.0;+Windows+NT+5.1;+SV1;+.NET+CLR+1.1.4322;+TencentTraveler)',
        'Mozilla/4.0+(compatible;+MSIE+6.0;+Windows+NT+5.1;+SV1;+.NET+CLR+2.0.50727;+.NET+CLR+3.0.4506.2152;+.NET+CLR+3.5.30729)',
        'Mozilla/5.0+(Windows+NT+5.1)+AppleWebKit/537.1+(KHTML,+like+Gecko)+Chrome/21.0.1180.89+Safari/537.1',
        'Mozilla/4.0+(compatible;+MSIE+6.0;+Windows+NT+5.1;+SV1)',
        'Mozilla/5.0+(Windows+NT+6.1;+rv:11.0)+Gecko/20100101+Firefox/11.0',
        'Mozilla/4.0+(compatible;+MSIE+8.0;+Windows+NT+5.1;+Trident/4.0;+SV1)',
        'Mozilla/4.0+(compatible;+MSIE+8.0;+Windows+NT+5.1;+Trident/4.0;+GTB7.1;+.NET+CLR+2.0.50727)',
        'Mozilla/4.0+(compatible;+MSIE+8.0;+Windows+NT+5.1;+Trident/4.0;+KB974489)',
        'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/30.0.1599.101 Safari/537.36',
        'Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/30.0.1599.101 Safari/537.36'
    ]
    return random.choice(uaList)

headers = [
    "Accept:text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
    "Accept-Encoding:gzip, deflate, sdch",
    "Accept-Language:zh-CN,zh;q=0.8,en;q=0.6",
    "Connection:keep-alive",
    "Host:www.domain.com",
    "RA-Sid:7739A016-20140918-030243-3adabf-48f828",
    "RA-Ver:2.8.9",
    "User-Agent:%s" % getUA()
]

def getHtml(url, headers):
    # same pycurl fetcher as step 4, with a short pause between retries
    x = 0
    while x < 10:
        x += 1
        try:
            c = pycurl.Curl()
            c.setopt(pycurl.MAXREDIRS, 5)
            c.setopt(pycurl.REFERER, url)
            c.setopt(pycurl.FOLLOWLOCATION, True)
            c.setopt(pycurl.CONNECTTIMEOUT, 60)
            c.setopt(pycurl.TIMEOUT, 120)
            c.setopt(pycurl.ENCODING, 'gzip,deflate')
            c.fp = StringIO.StringIO()
            c.setopt(pycurl.URL, url)
            c.setopt(pycurl.HTTPHEADER, headers)
            c.setopt(c.WRITEFUNCTION, c.fp.write)
            c.perform()
            return c.fp.getvalue()
        except:
            print "Request failed, retrying >>>"
            time.sleep(3)
            continue
    return ''   # give up after 10 failed attempts

def search(req, html):
    # return the first capture group of the regex, or 'no' when nothing matches
    text = re.search(req, html)
    return text.group(1) if text else 'no'

print ">>> Building the site-search URLs..."
url_list = []
for line in f:
    line = line.strip()
    url = '{query_url}=%s' % urllib.quote(line.split(',')[0])   # fill in your site-search URL
    url_list.append('%s,%s' % (line, url))
print ">>> Done"

print ">>> Deciding topic page vs. similar detail page for each word..."
detail_word_list = []   # detail URLs already claimed by some word

# crawler thread: fetch the site-search page for each word and classify it
class getPic(threading.Thread):
    def __init__(self, url_list):
        threading.Thread.__init__(self)
        self.url_list = url_list

    def downloadimg(self):
        for line in self.url_list:
            word = line.split(',')[0]
            searches = line.split(',')[1]
            url = line.split(',')[2]
            html = getHtml(url, headers)
            panding = re.sub(cigen, '', word)   # the word with the root removed
            m = 0   # whole-word recall
            n = 0   # main-word recall
            # collect the result titles, then count how many contain the whole
            # word (m) and how many contain the remainder (n); e.g. for
            # '网络营销策划书' this counts titles with '网络营销策划书' and '策划书'
            title_list = re.findall('<a[^>]*?>(.*?)</a>', html)
            for title in title_list:
                if word in title:
                    m += 1
                if panding in title:
                    n += 1
            if m == 0 or n == 0:
                ratio = '0'
            else:
                # whole-word recall ratio, as a bare percentage (computed for
                # reference; the decision below does not use it yet)
                ratio = str(format(float(m) / float(n), '.0%')).replace('%', '')
            number = search('b>结果数量:</b>(\d+)&', html)   # total result count
            if number == 'no':   # bug fix: treat a missing count as zero
                number = '0'
            if int(number) >= 10 and n > 5:
                # enough results and main-word recall: create a topic page
                zt.write("%s,%s\n" % (word, searches))
            elif number == '0':
                # no search results at all
                wjg.write("%s,%s\n" % (word, searches))
            elif int(searches) > 70 and int(number) >= 10:
                # high-volume words with >= 10 results also get a topic page
                zt.write("%s,%s\n" % (word, searches))
            else:
                # otherwise look for a detail page whose title can be reworked
                detail = search(r"href='(http://www.domain.com/%s/.*?)'" % py, html)
                if detail == 'no':
                    wjg.write("%s,%s\n" % (word, searches))
                elif detail not in detail_word_list:
                    xgt.write("%s,%s,%s\n" % (word, searches, detail))
                    detail_word_list.append(detail)
                # a detail page already claimed by another word is skipped

    def run(self):
        self.downloadimg()

if __name__ == "__main__":
    # split url_list into 3 chunks and run one thread per chunk
    getPicThreads = []
    chunk = (len(url_list) + 2) / 3
    for i in range(3):
        getPicThreads.append(getPic(url_list[chunk * i:chunk * (i + 1)]))
    for t in getPicThreads:
        t.start()
    for t in getPicThreads:
        t.join()
    print '>>> Done'
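To make the two recall counts concrete: for the word '网络营销策划书' with root '网络营销', the remainder is '策划书'; m counts result titles containing the whole word and n counts titles containing the remainder. A worked example on made-up titles:

# Worked example of whole-word (m) vs main-word (n) recall (made-up titles)
word = u'网络营销策划书'
panding = u'策划书'   # the word with the root u'网络营销' removed
titles = [u'网络营销策划书范文', u'活动策划书', u'策划书模板', u'网络营销课程']
m = sum(1 for t in titles if word in t)      # 1
n = sum(1 for t in titles if panding in t)   # 3
print m, n   # the ratio would be 1/3, about 33%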
6. Create six.py
#coding:utf-8
'''
Step 6: main entry point of the keyword library cleaning system. Runs all
the sub-scripts and prints the summary counts.
'''
import sys, os

inputfile, cigen, py = sys.argv[1:4]
os.system("python four.py %s %s %s" % (inputfile, cigen, py))   # runs steps 1-4
os.system("python five.py %s %s" % (cigen, py))                 # step 5

print ">>> Summary:"
print "New topic/list words:"
os.system("cat 新增列表词.txt|wc -l")
print "Detail-matched words:"
os.system("cat detail匹配词.txt|wc -l")
print "No-result words:"
os.system("cat 无结果词.txt|wc -l")
os.system("rm nopipei_word.txt")
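With all six scripts in one directory, a full run needs only the Fengchao export, the root word, and its pinyin (used in the detail URL pattern in step 5). The file name and arguments below are hypothetical:

python six.py fengchao.csv 网络营销 wangluoyingxiao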
