9. Python crawler -- pyspider
Introduction to pyspider
PySpider is a powerful web crawler system, written by a Chinese developer, with a powerful WebUI. It is written in Python, has a distributed architecture, and supports multiple database backends; the WebUI provides a script editor, a task monitor, a project manager, and a result viewer. Online demo: http://demo.pyspider.org/; tutorial: http://www.pyspider.cn/page/1.html
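Before diving into the project code, it helps to see the shape of a typical pyspider script: a Handler class whose on_start entry point seeds the crawl, and callbacks that parse each page through response.doc (a PyQuery object). The sketch below follows the style of the official quickstart; the seed URL and selectors are placeholders, not part of this project:

from pyspider.libs.base_handler import *

class Handler(BaseHandler):
    crawl_config = {}

    @every(minutes=24 * 60)          # re-run the entry point once a day
    def on_start(self):
        # placeholder seed URL
        self.crawl('http://example.com/', callback=self.index_page)

    @config(age=10 * 24 * 60 * 60)   # treat a fetched page as fresh for 10 days
    def index_page(self, response):
        # follow every absolute link found on the page
        for each in response.doc('a[href^="http"]').items():
            self.crawl(each.attr.href, callback=self.detail_page)

    def detail_page(self, response):
        # whatever a callback returns is stored as the crawl result
        return {"url": response.url, "title": response.doc('title').text()}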
Project requirement:
Crawl the articles on http://www.adquan.com/ for internal study within the company.
A simple, first-pass implementation (naming conventions ignored):
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# __author__: jiangjing
# date: 2018/2/2
# Created on 2017-12-07 13:40:43
# Project: adquan

from pyspider.libs.base_handler import *
import os
import urlparse
import datetime
import logging
import re
import requests
import uuid

logger = logging.getLogger(__name__)

UPLOAD_IMAGE_URL = "http://10.32.64.194:8233/api/NoLogin/UploadImage"  # upload images to the target server
ADD_WEIBO_ARTICLE_URL = "http://10.32.64.194:8233/api/NoLogin/AddDraft"  # save the crawled article as a draft
WEIBO_IMAGE_URL = "/upload/image"  # prefix used when rewriting image URLs
PLAY_VIDEO_URL = "http://10.32.64.196:10001/hls/video/catch"  # playback URL for crawled videos
IMAGE_DIR_PATH = "/var/hls/image/catch"  # where downloaded images are stored
VIDEO_DIR_PATH = "/var/hls/video/catch"  # where downloaded videos are stored


class Handler(BaseHandler):
    crawl_config = {}

    def __init__(self):
        self.deal = Deal()

    @every(minutes=24 * 60 * 3)  # restart the crawl every 3 days
    def on_start(self):
        self.crawl('http://www.adquan.com', callback=self.index_page)

    @config(age=100 * 24 * 60 * 60)
    def index_page(self, response):
        for each in response.doc('.work_list_left .w_l_inner_img a').items():
            self.crawl(each.attr.href, callback=self.detail_page)

    @config(priority=2)
    def detail_page(self, response):
        today_str = datetime.date.today().strftime("%Y-%m-%d")
        # grab the cover image
        cover_guid = ''
        for img in response.doc('.con_pic_title img').items():
            url = img.attr.src
            if url:
                image_path = self.deal.getImageDirPath()
                extension = self.deal.getExtension(url)
                guid = str(uuid.uuid1()).replace('-', '')
                file_name = "origin_" + guid + '.' + extension
                file_path = image_path + '/' + file_name
                content = requests.get(url).content
                self.deal.saveImg(content, file_path)
                self.upload_image_to_weibo(file_path, guid, file_name)
                cover_guid = guid

        # grab the images inside the article body and rewrite their URLs
        for img in response.doc('.con_Text img').items():
            url = img.attr.src
            if url:
                extension = self.deal.getExtension(url)
                guid = str(uuid.uuid1()).replace('-', '')
                file_name = "origin_" + guid + '.' + extension
                self.crawl(img.attr.src, callback=self.save_img,
                           save={'file_name': file_name, 'guid': guid})
                img.attr.src = '%s/%s/%s' % (WEIBO_IMAGE_URL,
                                             datetime.date.today().strftime("%Y%m%d"),
                                             file_name)

        # grab embedded videos
        for video in response.doc('.con_Text iframe').items():
            width = video.attr.width
            if not width:
                width = 600
            iframe_url = str(video.attr('data-src')).strip()
            if not video.attr('data-src'):
                iframe_url = str(video.attr.src).strip()
            if not iframe_url:
                continue
            ret = urlparse.urlparse(iframe_url)
            vids = re.findall(r'vid=(\w+)&?', ret.query)
            if not vids or not vids[0].strip():
                logger.error("get video id failed, url:%s" % iframe_url)
                continue
            guid = str(uuid.uuid1()).replace('-', '')
            play_url = '%s/%s/%s.mp4' % (PLAY_VIDEO_URL, today_str, guid)
            cover_img = '%s/%s/%s.jpg' % (PLAY_VIDEO_URL, today_str, guid)
            video.replaceWith('<video controls="1" src=%s width=%s poster=%s></video>'
                              % (play_url, str(width), cover_img))
            video.attr.poster = cover_img
            self.download_video(vids[0].strip(), guid)

        if response.doc('.text_title').text() != '':
            html_content = response.doc('.con_Text').html()
            text_content = self.deal.filterTag(html_content)
            self.add_article_to_weibo(response.doc('.text_title').text(),
                                      html_content, text_content, 2, cover_guid)

    def add_article_to_weibo(self, title, content, contentText, articleType, picguid):
        data = {'title': title, "content": content, "contentText": contentText,
                "articleType": articleType, "picguid": picguid}
        response = requests.post(ADD_WEIBO_ARTICLE_URL, data=data)
        return {
            "add_article_to_weibo": response.text
        }

    def download_video(self, vid, guid):
        # ask Tencent video's getkey endpoint for a play key, then fetch the mp4
        data = {
            "otype": "xml",
            "platform": 11,
            "format": 2,
            "vid": vid,
            "filename": "1.mp4",
            "appver": '3.2.19.333'
        }
        response = requests.post('http://vv.video.qq.com/getkey', data=data)
        keys = re.findall(r'<key>([\s\S]+?)</key>', response.text)
        if len(keys) != 0:
            video_url = ('http://videohy.tc.qq.com/video.dispatch.tc.qq.com/'
                         '%s.mp4?vkey=%s&ocid=2692093356' % (vid, keys[0].strip()))
            response = requests.get(video_url)
            self.save_vedio(response.content, guid)

    def save_img(self, response):
        content = response.content
        image_path = self.deal.getImageDirPath()
        file_name = response.save['file_name']
        guid = response.save['guid']
        file_path = image_path + '/' + file_name
        self.deal.saveImg(content, file_path)
        self.upload_image_to_weibo(file_path, guid, file_name)

    def upload_image_to_weibo(self, file_path, guid, file_name):
        data = {'guid': guid, "fileName": file_name}
        files = {'file': open(file_path, 'rb')}
        response = requests.post(UPLOAD_IMAGE_URL, data=data, files=files)
        return {
            "upload_image": response.text
        }

    def save_vedio(self, content, guid):
        ext = ".mp4"
        file_name = guid + ext
        video_path = self.deal.getVideoDirPath()
        file_path = video_path + '/' + file_name
        self.deal.saveVedio(content, file_path)
        # grab a frame at the 1-second mark as the video poster
        os.system('ffmpeg -i %s -y -f image2 -ss 1 -vframes 1 %s'
                  % (file_path, file_path.replace('.mp4', '.jpg')))

    def cut_video(self, shell):
        os.system(shell)


class Deal:
    def __init__(self):
        today_str = datetime.date.today().strftime("%Y-%m-%d")
        self.mkDir('%s/%s' % (IMAGE_DIR_PATH, today_str))
        self.mkDir('%s/%s' % (VIDEO_DIR_PATH, today_str))

    def getImageDirPath(self):
        today_str = datetime.date.today().strftime("%Y-%m-%d")
        return '%s/%s' % (IMAGE_DIR_PATH, today_str)

    def getVideoDirPath(self):
        today_str = datetime.date.today().strftime("%Y-%m-%d")
        return '%s/%s' % (VIDEO_DIR_PATH, today_str)

    def mkDir(self, path):
        path = path.strip()
        if not os.path.exists(path):
            os.makedirs(path)
        return path

    def saveImg(self, content, path):
        with open(path, 'wb') as f:
            f.write(content)

    def saveVedio(self, content, path):
        with open(path, 'wb') as f:
            f.write(content)

    def getExtension(self, url):
        return url.split('.')[-1]

    def filterTag(self, htmlstr):
        '''Strip tags and other markup from an HTML string.
        @param htmlstr: the HTML string.'''
        re_cdata = re.compile('<!DOCTYPE HTML PUBLIC[^>]*>', re.I)  # matches the DOCTYPE declaration
        re_script = re.compile(r'<\s*script[^>]*>[^<]*<\s*/\s*script\s*>', re.I)  # strip scripts
        re_style = re.compile(r'<\s*style[^>]*>[^<]*<\s*/\s*style\s*>', re.I)  # strip styles
        re_br = re.compile(r'<br\s*?/?>')
        re_h = re.compile(r'</?\w+[^>]*>')
        re_comment = re.compile(r'<!--[\s\S]*-->')
        s = re_cdata.sub('', htmlstr)
        s = re_script.sub('', s)
        s = re_style.sub('', s)
        s = re_br.sub('\n', s)
        s = re_h.sub(' ', s)
        s = re_comment.sub('', s)
        blank_line = re.compile('\n+')  # collapse runs of blank lines
        s = blank_line.sub('\n', s)
        s = re.sub(r'\s+', ' ', s)
        s = self.replaceCharEntity(s)
        return s

    def replaceCharEntity(self, htmlstr):
        '''Replace common HTML character entities with plain characters.
        Add entries to CHAR_ENTITIES to handle more entities.
        @param htmlstr: the HTML string.'''
        CHAR_ENTITIES = {'nbsp': '', '160': '',
                         'lt': '<', '60': '<',
                         'gt': '>', '62': '>',
                         'amp': '&', '38': '&',
                         'quot': '"', '34': '"'}
        # named group: the \w+ part of each match is captured as 'name'
        re_charEntity = re.compile(r'&#?(?P<name>\w+);')
        sz = re_charEntity.search(htmlstr)
        while sz:
            key = sz.group('name')  # fetch the named group
            try:
                # the final argument 1 means: replace only the first match
                htmlstr = re_charEntity.sub(CHAR_ENTITIES[key], htmlstr, 1)
                sz = re_charEntity.search(htmlstr)
            except KeyError:
                # unknown entity: drop it
                htmlstr = re_charEntity.sub('', htmlstr, 1)
                sz = re_charEntity.search(htmlstr)
        return htmlstr
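The two HTML-cleaning helpers at the bottom (filterTag and replaceCharEntity) can be exercised on their own, outside pyspider. A hypothetical usage sketch, not part of the original script; note that constructing Deal creates today's directories under IMAGE_DIR_PATH and VIDEO_DIR_PATH as a side effect, so those paths must be writable:

# Hypothetical sanity check -- assumes IMAGE_DIR_PATH / VIDEO_DIR_PATH
# point somewhere writable, since Deal() creates today's directories
# on construction.
d = Deal()
raw = '<p>Hello&nbsp;&amp;&nbsp;<b>world</b><br/><!-- note --></p>'
print(d.filterTag(raw))  # prints roughly: ' Hello& world '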