一点资讯 (Yidian Zixun) video scraping with PhantomJS
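The full script is below. As a warm-up, here is a minimal sketch of the core idea: render one channel list page with PhantomJS so the JavaScript-built list actually appears in the HTML, then pull the document IDs out with BeautifulSoup. The channel URL is a placeholder (the real list URLs come from video_list.ydzx_url_list in the script); the selectors are the same ones the script uses.

    # minimal sketch, assuming PhantomJS lives at /usr/local/phantomjs/bin/phantomjs
    # and that selenium, bs4 and lxml are installed; the channel URL is a placeholder
    from selenium import webdriver
    from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
    from bs4 import BeautifulSoup

    dcap = dict(DesiredCapabilities.PHANTOMJS)
    dcap["phantomjs.page.settings.userAgent"] = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_2) AppleWebKit/537.36"

    driver = webdriver.PhantomJS(desired_capabilities=dcap,
                                 executable_path='/usr/local/phantomjs/bin/phantomjs')
    driver.get('http://www.yidianzixun.com/channel/xxxx')  # placeholder list page
    html = driver.page_source
    driver.quit()

    soup = BeautifulSoup(html, 'lxml')
    for item in soup.select('div.channel-news a.style-content-middle'):
        # each docid maps to http://www.yidianzixun.com/article/<docid>
        print(item.get('data-docid'))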
# _*_ coding: utf-8 _*_
"""
Approach:
1. Render the list pages with PhantomJS (simulated browsing).
2. For each link, grab only the first page (9-10 items) and deduplicate by title.
3. Run as a scheduled job, once a day at 8:00.
"""
import MySQLdb
import redis
import sys
import os
import re
import urllib
import requests
import time
import hashlib
import traceback
import urlparse
import random
import signal
# import multiprocessing
import matplotlib
matplotlib.use("Agg")
import shutil
import socket
# guard against image downloads that hang
socket.setdefaulttimeout(30)
import multiprocessing
from config import IConfig
from video_list import ydzx_url_list
from bs4 import BeautifulSoup
from upload_images import UploadFile
from moviepy.editor import VideoFileClip
from selenium import webdriver
from selenium.webdriver.common.desired_capabilities import DesiredCapabilities

reload(sys)
sys.setdefaultencoding('utf-8')


class WxpnVideo(multiprocessing.Process):

    def __init__(self):
        self.redisConf = IConfig.load('resource.redis')
        self.redisServer = redis.Redis(host=self.redisConf['host'],
                                       port=self.redisConf['port'],
                                       db=self.redisConf['db'],
                                       password=self.redisConf['passwd'])
        self.dbConfig = IConfig.load('resource.mysql')
        self.conn = MySQLdb.connect(
            user=self.dbConfig['user'],
            passwd=self.dbConfig['password'],
            db=self.dbConfig['dbname'],
            host=self.dbConfig['host'],
            charset="utf8",
            use_unicode=True)
        self.conn.ping(True)
        self.cursor = self.conn.cursor()
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.181 Safari/537.36',
            'Host': 'www.yidianzixun.com',
            'X-Requested-With': 'XMLHttpRequest',
            'Connection': 'keep-alive',
            'Accept': '*/*',
            'Accept-Encoding': 'gzip, deflate',
            'Accept-Language': 'zh-CN,zh;q=0.9',
        }
        self.domain = IConfig.load('resource.domain')
        self.apiConf = IConfig.load('resource.apiurl')
        self.key_video_list = 'wxpn:video:list'
        self.key_title = 'wxpn:video:title'
        self.storeConfig = IConfig.load('resource.store')
        self.thumb_path = self.storeConfig['images_path']
        self.ossConf = IConfig.load('resource.oss')
        self.key_id = self.ossConf['access_key_id']
        self.key_secret = self.ossConf['access_key_secret']
        self.endponit = self.ossConf['endponit']
        self.img_upload = UploadFile()
        self.auth = self.img_upload.auth_oss(self.key_id, self.key_secret)
        self.videoConf = IConfig.load('resource.apiurl')
        self.video_publish = self.videoConf['video_publish_api']
        self.ydzx_page_api = self.videoConf['ydzx_page_api']
        self.start_time = int(time.time())
        multiprocessing.Process.__init__(self)

    def store_video_list_redis(self, video_list):
        # push the configured list-page URLs into a Redis set (skip duplicates)
        if video_list:
            for per_list in video_list:
                if not self.redisServer.sismember(self.key_video_list, per_list):
                    self.redisServer.sadd(self.key_video_list, per_list)
        else:
            return False

    def get_video_para(self):
        # pop list-page URLs from Redis, render each with PhantomJS and yield
        # (title_md5, docid) pairs that have not been crawled before
        while True:
            if self.redisServer.scard(self.key_video_list) == 0:
                break
            link = self.redisServer.spop(self.key_video_list)
            print(link)
            # url = self.ydzx_page_api + link
            # try:
            #     res = requests.get(url=url, timeout=60)
            # except Exception as e:
            #     print('连接失败')
            # print(res.status_code)
            dcap = dict(DesiredCapabilities.PHANTOMJS)
            dcap["phantomjs.page.settings.userAgent"] = (
                "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/66.0.3359.139 Safari/537.36"
            )
            try:
                driver = webdriver.PhantomJS(desired_capabilities=dcap,
                                             executable_path='/usr/local/phantomjs/bin/phantomjs')
                # driver.set_page_load_timeout(10)
                # driver.set_script_timeout(10)
                time.sleep(random.randrange(3, 8))
                driver.get(link)
                time.sleep(random.randrange(2, 6))
                text = driver.page_source
                driver.service.process.send_signal(signal.SIGTERM)
                driver.quit()
            except Exception as e:
                print(traceback.format_exc())
                continue
            # if res.status_code == 200:
            soup = BeautifulSoup(text, 'lxml')
            title_list = soup.select('div.channel-news div.doc-title')
            itemid_list = soup.select('div.channel-news a.style-content-middle')
            if title_list and itemid_list:
                try:
                    for num, title in enumerate(title_list):
                        # deduplicate by the MD5 of the title
                        m = hashlib.md5()
                        m.update(str(title.text).strip())
                        psw = m.hexdigest()
                        print(title.text)
                        itemid = itemid_list[num]['data-docid']
                        if not self.redisServer.sismember(self.key_title, psw):
                            yield psw, itemid
                except Exception as e:
                    print(traceback.format_exc())
                    continue
            else:
                print('一点资讯视频列表{0}页链接请求失败,请及时查看原因'.format(link))
                # self.form_data['content'] = '【创业黑马】预警:一点资讯视频列表{0}页链接请求失败,请及时查看原因'.format(link)
                # res = requests.post(self.msg_api, data=self.form_data)

    def time_cycle(self, origin_time):
        # convert Yidian Zixun's relative timestamps ("昨天", "N天", "N小时", ...)
        # to a Unix timestamp
        now = time.time()
        try:
            if origin_time == '昨天':
                published = int(now) - 24 * 3600
            elif '天' in origin_time:
                day_one = re.compile('(.*?)天')
                published = int(now) - int(day_one.findall(origin_time)[0]) * 24 * 3600
            elif '小时' in origin_time:
                hour_one = re.compile('(.*?)小时')
                published = int(now) - int(hour_one.findall(origin_time)[0]) * 3600
            elif '分' in origin_time:
                min_one = re.compile('(.*?)分')
                published = int(now) - int(min_one.findall(origin_time)[0]) * 60
            elif '月' in origin_time:
                month_one = re.compile('(.*?)个月')
                published = int(now) - int(month_one.findall(origin_time)[0]) * 30 * 24 * 3600
            else:
                timeArray = time.strptime(origin_time, "%Y.%m.%d")
                published = int(time.mktime(timeArray))
            return published
        except Exception as e:
            print(traceback.format_exc())

    def download_video(self, psw, itemid):
        now = int(time.time())
        url = 'http://www.yidianzixun.com/article/' + itemid
        print(url)
        self.headers['Referer'] = url
        try:
            res = requests.get(url=url, headers=self.headers, timeout=60)
            print(res.status_code)
        except Exception as e:
            print('小链接连接失败')
            return False
        if res.status_code == 200:
            soup = BeautifulSoup(res.text, 'lxml')
            title = soup.select('div.left-wrapper > h2')[0].text
            try:
                video_src = soup.select('div.video-wrapper > video')[0]['src']
            except Exception as e:
                print('此篇为文章,不是视频')
                return False
            thumb_src = soup.select('div.video-wrapper > video')[0]['poster']
            try:
                source = soup.select('body.page-article .left-wrapper > .meta > a')[0].text
            except Exception as e:
                source = soup.select('body.page-article .left-wrapper > .meta > span')[0].text
            source = re.sub('来源:', '', str(source))
            publishtime = soup.select('body.page-article .left-wrapper > .meta > span')[0].text
            try:
                timestamp = self.time_cycle(str(publishtime))
            except Exception as e:
                timestamp = now
            img_url_parts = urlparse.urlparse(thumb_src)
            img_url_query = urlparse.parse_qs(img_url_parts.query, True)
            if 'wx_fmt' in img_url_query:
                ext_name = '.' + img_url_query['wx_fmt'][0]
            else:
                ext_name = '.png'
            thumb_p = self.thumb_path + 'video/thumb'
            if not os.path.exists(thumb_p):
                os.mkdir(thumb_p)
            # download the poster image locally, then push it to OSS
            img_down_local_path = thumb_p + '/' + psw[:20] + ext_name
            urllib.urlretrieve(thumb_src, img_down_local_path)
            file_name = psw[:20] + ext_name
            if os.path.exists(img_down_local_path):
                images_path = self.ossConf['video_thumb_path']
                status = self.img_upload.upload_to_oss(self.auth, self.endponit, images_path, file_name, img_down_local_path)
                thumb_src = self.domain['img_url_oss'] + 'Cmstop/ydzx/' + file_name
            m = hashlib.md5()
            m.update(str(thumb_src))
            psw_thumb = m.hexdigest()
            try:
                delay_re = re.compile('"duration":(\d+)')
                playtime = delay_re.findall(str(res.text))[0]
            except Exception as e:
                print(traceback.format_exc())
                playtime = None
            # download the video file and make sure moviepy can open it
            video_path = self.thumb_path + 'video/' + str(video_src).split('/')[-1]
            video_res = requests.get(video_src).content
            with open(video_path, 'wb') as f:
                f.write(video_res)
            try:
                clip = VideoFileClip(video_path)
                print(clip.duration)
            except Exception as e:
                print(traceback.format_exc())
                return False
            # with open(video_path, 'r') as f:
            #     length = len(f.read())
            # if length < 819200:
            #     return False
            video_name = str(video_src).split('/')[-1][10:]
            if os.path.exists(video_path):
                images_path = self.ossConf['video_path']
                status = self.img_upload.upload_to_oss(self.auth, self.endponit, images_path, video_name, video_path)
                print('')
            if status != 'success':
                return False
            print('视频上传成功')
            video_link = self.domain['img_url_oss'] + 'Cmstop/video/ydzx/' + video_name
            # write the topic, content and video rows, then call the publish API
            try:
                sql = "insert into cmstop_comment_topic(title, description, thumb, created, url_md5, url) values(%s, '', %s, %s, %s, '')"
                params = (title, thumb_src, now, psw_thumb)
                self.cursor.execute(sql, params)
                self.conn.commit()
                topicid = self.cursor.lastrowid
            except Exception as e:
                print(traceback.format_exc())
                self.conn.rollback()
            sourceid = self.get_article_sourceid(source)
            try:
                result = self.cursor.execute("""
                    insert into cmstop_content(topicid, sourceid, catid, modelid, title, subtitle, source_title, source_link, weight, status, created, score, published, thumb, createdby)
                    values(%s, %s, %s, %s, %s, %s, %s, %s, 60, %s, %s, %s, %s, %s, %s)
                    """, (topicid, sourceid, 47, 4, title, None, title, '', 3, now, 0, timestamp, thumb_src, 0))
                self.conn.commit()
                lastrowid = self.cursor.lastrowid
            except Exception as e:
                print(traceback.format_exc())
                self.conn.rollback()
            video_id = str(video_src).split('/')[-1][10:-4]
            try:
                sql = "insert into cmstop_video(contentid, video, playtime, author, video_id, aid) values(%s, %s, %s, %s, %s, %s)"
                print(sql)
                params = (lastrowid, video_link, playtime, source, video_id, 0)
                self.cursor.execute(sql, params)
                self.conn.commit()
            except Exception as e:
                print(traceback.format_exc())
                self.conn.rollback()
            self.redisServer.sadd(self.key_title, psw)
            api_url = self.video_publish + str(lastrowid)
            try:
                resp = urllib.urlopen(api_url)
                result = resp.read()
            except:
                print('connect failed')
        else:
            print('一点资讯视频主链接请求失败,请及时查看原因')
            # self.form_data['content'] = '【创业黑马】预警:一点资讯视频主链接请求失败,请及时查看原因'
            # res = requests.post(self.msg_api, data=self.form_data)

    def get_article_sourceid(self, source, medias=[]):
        # look up (or create) the cmstop_source row for this media source
        source = source.strip()
        sourceid = 0
        """
        print source
        print set([source.encode('utf-8')])
        print medias
        """
        result = self.cursor.execute('select `sourceid`, `name`, `has_signed_contract` from `cmstop_source` where `name`=%s', (source,))
        has_signed_contract = 0
        if medias and (set([source.encode('utf-8')]) & medias):
            has_signed_contract = 1
        if result:
            data = self.cursor.fetchone()
            sourceid = data[0]
            if data[2] != has_signed_contract:
                try:
                    result = self.cursor.execute("""
                        update `cmstop_source` set `has_signed_contract`=%s where sourceid=%s
                        """, (has_signed_contract, sourceid))
                    self.conn.commit()
                except:
                    self.conn.rollback()
        else:
            try:
                result = self.cursor.execute("""
                    insert into `cmstop_source`(`name`, `logo`, `url`, `initial`, `has_signed_contract`)
                    values(%s, %s, %s, %s, %s)
                    """, (source, '', '', '', has_signed_contract))
                self.conn.commit()
                sourceid = self.cursor.lastrowid
            except:
                self.conn.rollback()
        return sourceid

    def run(self):
        os.system('pkill phantomjs')
        # a lock file keeps overlapping runs from stepping on each other
        lockConf = IConfig.load('resource.lock')
        lock_file = lockConf['lock_path_ydzx']
        if os.path.exists(lock_file):
            print('lock file exists')
            return False
        os.system(r'touch %s ' % lock_file)
        self.store_video_list_redis(ydzx_url_list)
        get_video_para = self.get_video_para()
        for psw, itemid in get_video_para:
            print(psw)
            stop_time = int(time.time())
            balance_time = stop_time - self.start_time
            if balance_time >= 10800:  # cap the total run time at 3 hours
                self.del_file(self.thumb_path + 'video')
                os.system(r'rm -rf %s' % lock_file)
                os._exit(0)
            try:
                self.download_video(psw=psw, itemid=itemid)
                time.sleep(random.uniform(2, 8))
                os.system('pkill ffmpeg-osx-v3.2.4')
            except Exception as e:
                print(traceback.format_exc())
                continue
        self.del_file(self.thumb_path + 'video')
        os.system(r'rm -rf %s' % lock_file)

    def video_publish(self):
        # one-off helper that re-triggers the publish API for a range of contentids
        sql = 'select contentid from cmstop_video where contentid<=3528920 and contentid>=3430851'
        self.cursor.execute(sql)
        data = self.cursor.fetchall()
        for num in data:
            api_url = self.video_publish + str(num[0])
            try:
                resp = urllib.urlopen(api_url)
                result = resp.read()
            except:
                print('connect failed')

    def del_file(self, path):
        os.chdir(path)  # switch into the directory to be emptied
        ds = list(os.listdir(path))  # everything (files and directories) under it
        for d in ds:
            if os.path.isfile(d):  # plain files are removed directly
                os.remove(d)
            else:  # directories are removed recursively
                shutil.rmtree(d)


if __name__ == '__main__':
    video_one = WxpnVideo()
    # video_two = WxpnVideo()
    video_one.start()
    # video_two.start()
    video_one.join()
    # video_two.join()
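For step 3 in the docstring (run once a day at 8:00), one option is a plain cron entry; the interpreter and script path below are placeholders, not paths the original specifies:

    0 8 * * * /usr/bin/python /path/to/ydzx_video.py >> /var/log/ydzx_video.log 2>&1

The script's own lock file (resource.lock / lock_path_ydzx) and the 3-hour cap in run() keep a slow crawl from overlapping the next day's run.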