Python练习:优酷评论过滤(抓取当前视频全部评论,并过滤不包括所需关键词的留言)

 1 # coding:utf-8
 2 print('正在初始化...')
 3 import requests
 4 import re
 5 from lxml.html import fromstring
 6 import pyautogui
 7 import sys
 8 import os
 9 from multiprocessing.dummy import Pool
10 from lxml import _elementpath
11 
12 
class Youku_comment:

    """Fetch every comment posted on a single Youku video page.

    All network work happens in the constructor. Afterwards the instance
    exposes:

    * ``pid`` -- the video id parsed out of the page URL.
    * ``aa``  -- a flat list with the text of every comment retrieved.
    """

    # Seconds to wait for the video page and for the page-count request.
    PAGE_TIMEOUT = 10
    # Seconds to wait for one page of comments; failures are retried.
    COMMENT_TIMEOUT = 3
    # Attempts made per comment page before it is given up on.
    COMMENT_RETRIES = 5
    # Upper bound on the number of concurrent download threads.
    MAX_WORKERS = 30

    _PID_RE = re.compile(r'/id_(.*?)\.html')
    _TITLE_RE = re.compile(r'<title>(.*?)</title>')
    _COMMENT_RE = re.compile(r'<p id=".*?">.*?</p>', flags=re.S)
    _TAG_RE = re.compile(r'<.*?>')

    def __init__(self, raw_url):
        """Download all comments of the video at *raw_url*.

        Args:
            raw_url: Video page URL containing ``/id_<ID>.html``.

        Raises:
            ValueError: If no video id can be found in *raw_url*.
            requests.RequestException: If the video page or the page-count
                request fails (individual comment pages are best-effort).
        """
        pid_match = self._PID_RE.search(raw_url)
        if pid_match is None:
            raise ValueError(
                'no video id ("/id_<ID>.html") found in URL: %r' % (raw_url,))
        self.pid = pid_match.group(1)

        r1 = requests.get(raw_url, timeout=self.PAGE_TIMEOUT)
        # The title is only displayed, so a page without one is not fatal.
        title_match = self._TITLE_RE.search(r1.text)
        title = title_match.group(1) if title_match else ''
        title = re.sub(r'\W', '', title).replace('在线播放优酷网视频高清在线观看', '')
        totalpn = self.get_totalpn(self.pid)
        print('视频ID:%s' % self.pid, '\n视频标题:%s' %
              title, '\n总页码数:%s\n正在抓取...' % totalpn)

        pagenums = range(1, totalpn + 1)
        # Never start more threads than there are pages to download.
        pp = Pool(max(1, min(self.MAX_WORKERS, totalpn)))
        try:
            result = pp.map(self.get_comment, pagenums)
        finally:
            # Release the worker threads even when a download raised.
            pp.close()
            pp.join()
        # Flatten the per-page lists, skipping pages that yielded nothing.
        self.aa = [comment for page in result if page for comment in page]

    def get_totalpn(self, pid):
        """Return the number of comment pages to request for video *pid*.

        The count is derived from the ``totalSize`` field of the first
        comment page, divided by 30 plus one.
        NOTE(review): this assumes the endpoint serves 30 comments per
        page -- confirm. When ``totalSize`` is an exact multiple of 30 one
        extra (empty) page is requested, which is harmless.
        """
        r = requests.get(
            'http://comments.youku.com/comments/~ajax/vpcommentContent.html?__ap={"videoid":"%s","page":1}' % pid,
            timeout=self.PAGE_TIMEOUT)
        # str() tolerates the field arriving as a number instead of "1,234".
        total = int(str(r.json()['totalSize']).replace(',', ''))
        return total // 30 + 1

    def get_comment(self, pagenum):
        """Return the comment texts found on page *pagenum*.

        The page is requested up to ``COMMENT_RETRIES`` times; an attempt
        counts as failed when the request errors out, the response is not
        the expected JSON, or no comment is found. Returns an empty list
        when every attempt failed.
        """
        url = 'http://comments.youku.com/comments/~ajax/vpcommentContent.html?__ap={"videoid":"%s","page":%s}' % (
            self.pid, pagenum)
        for _ in range(self.COMMENT_RETRIES):
            try:
                r = requests.get(url, timeout=self.COMMENT_TIMEOUT)
                ss = self._extract_comments(r.json()['con'])
            except (requests.RequestException, ValueError, KeyError, TypeError):
                # Network failure, invalid JSON or unexpected payload: retry.
                continue
            if ss:
                return ss
        return []

    @staticmethod
    def _extract_comments(scode):
        """Return the text of every ``<p id="...">`` element in *scode*, tags removed."""
        return [Youku_comment._TAG_RE.sub('', block)
                for block in Youku_comment._COMMENT_RE.findall(scode)]
57 
58 
# Keywords used when the caller supplies none (background-music related).
_FILT1_DEFAULT_KEYWORDS = tuple(
    'OST 背景 音乐 旋律 歌曲 调子 music 耳熟 BGM 谁唱的 来自 出自 原声'.split(' '))


def filt1(str1, kws):
    """Return *str1* if it contains at least one keyword, otherwise ``None``.

    Args:
        str1: The text to test.
        kws: Whitespace-separated keywords. Any falsy value (``''``, ``0``,
            ``None``) or a string holding only whitespace selects the
            built-in background-music keyword list.

    Returns:
        *str1* unchanged when a keyword occurs in it, else ``None``.
    """
    # split() without an argument drops the empty tokens produced by
    # repeated or trailing spaces; an empty keyword would match every text.
    keywords = kws.split() if kws else []
    if not keywords:
        keywords = _FILT1_DEFAULT_KEYWORDS
    if any(keyword in str1 for keyword in keywords):
        return str1
    return None
65 
66 
def quchong(ll):
    """Join the distinct strings of *ll* into one newline-prefixed text.

    Items keep their first-seen order and each one is preceded by a
    newline, so the result starts with ``'\\n'`` unless it is empty.
    Only exact duplicates are removed; empty strings are skipped.

    Args:
        ll: Iterable of strings.

    Returns:
        The concatenated text, or ``''`` when there is nothing to output.
    """
    seen = set()
    parts = []
    for item in ll:
        # Compare whole items: a substring test against the output built so
        # far would also discard distinct comments that merely occur inside
        # an earlier, longer one.
        if not item or item in seen:
            continue
        seen.add(item)
        parts.append('\n' + item)
    return ''.join(parts)
75 
# Interactive driver: ask for a video URL, download its comments once, then
# let the user run any number of keyword searches over them. An empty or
# cancelled URL prompt ends the program.
while 1:
    try:
        url = pyautogui.prompt('请输入网址:')
        if not url:
            break
        tt = Youku_comment(url)
        pinglun = tt.aa
        while 1:
            kws = pyautogui.prompt('请输入关键词,多个请用空格隔开(直接回车则代表找背景音乐):')
            # An empty or cancelled prompt selects filt1's default keywords.
            kws = kws if kws else 0
            ss = [filt1(i, kws) for i in pinglun]
            ss = [i for i in ss if i]
            ss = quchong(ss)
            print('检索结果:\n')
            print(ss)
            # The two buttons need distinct labels; with identical (empty)
            # labels the answer cannot be told apart and the search loop
            # would always stop after the first query.
            jixu = pyautogui.confirm(
                text='是否要继续检索', title='请确认', buttons=['是', '否'])
            # Anything but an explicit "yes" (including closing the dialog)
            # returns to the URL prompt.
            if jixu != '是':
                break

    except Exception as e:
        # Top-level boundary: report the problem and ask for a URL again.
        print(e)
        print('错误,请重试')
# Keep the console window open on Windows; "pause" does not exist elsewhere.
if os.name == 'nt':
    os.system('pause')

 


Windows已编译可执行文件: http://pan.baidu.com/s/1bn0jLmf

posted @ 2015-08-15 13:16  pyld  阅读(610)  评论(0)  编辑  收藏  举报