Scraping jobbole data with Scrapy
1. Python version: 3.6.1
2. Python editor: JetBrains PyCharm
3. Install virtualenvwrapper-win
pip3 install virtualenvwrapper-win
4. Create the virtualenv and install dependencies
mkvirtualenv spider_article
pip install C:\Users\CR\Downloads\Twisted-17.5.0-cp36-cp36m-win_amd64.whl
pip install pypiwin32
pip install -i https://pypi.douban.com/simple/ scrapy
pip install mysqlclient
pip install pillow
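A short note on the order of these installs: on Windows, Scrapy's Twisted dependency often fails to compile from source during pip install, which is why a pre-built Twisted wheel and pypiwin32 are installed before Scrapy itself. The Douban index is only a mirror used to speed up downloads and can be omitted.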
5. Create the Scrapy project in the location where it will be stored:
1. Open cmd
2. workon spider_article
3. scrapy startproject ArticleSpider
4. cd ArticleSpider
5. scrapy genspider jobbole blog.jobbole.com
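For reference, scrapy genspider creates a spider skeleton roughly like the sketch below (the exact template varies slightly between Scrapy versions); the full spider in step 7 fills in the empty parse method.

# ArticleSpider/spiders/jobbole.py right after `scrapy genspider jobbole blog.jobbole.com`
# (approximate output of the default template)
# -*- coding: utf-8 -*-
import scrapy


class JobboleSpider(scrapy.Spider):
    name = 'jobbole'
    allowed_domains = ['blog.jobbole.com']
    start_urls = ['http://blog.jobbole.com/']

    def parse(self, response):
        pass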
6. Create a debug entry script under the ArticleSpider folder
from scrapy.cmdline import execute

import sys
import os

sys.path.append(os.path.dirname(os.path.abspath(__file__)))
execute(["scrapy", "crawl", "jobbole"])
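With this file (commonly named main.py) saved at the project root, the spider can be started or debugged by running the file directly from PyCharm, with breakpoints working inside the spider code, instead of launching scrapy crawl jobbole from a console.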
7. Main code
1. jobbole.py
import scrapy
import re
import datetime
from scrapy.http import Request
from urllib import parse  # in Python 2 this was: import urlparse
from scrapy.loader import ItemLoader


from ArticleSpider.items import JobBoleArticleItem, ArticleItemLoader

from ArticleSpider.utils.common import get_md5


class JobboleSpider(scrapy.Spider):
    name = 'jobbole'
    allowed_domains = ['blog.jobbole.com']
    start_urls = ['http://blog.jobbole.com/all-posts/']

    def parse(self, response):
        '''
        1. Extract the article URLs from the list page and hand them to Scrapy to download and parse.
        2. Extract the next-page URL and hand it to Scrapy; once downloaded it is passed back to parse.
        :param response:
        :return:
        '''
        # extract() returns a list as soon as it is called
        post_nodes = response.css("#archive .floated-thumb .post-thumb a")
        for post_node in post_nodes:
            image_url = post_node.css("img::attr(src)").extract_first("")
            # extract the URL of the current article
            post_url = post_node.css("::attr(href)").extract_first("")
            # parse.urljoin: if post_url has no domain, the domain is taken from response.url;
            # if post_url already contains a domain, response.url has no effect
            yield Request(url=parse.urljoin(response.url, post_url),
                          meta={"front_image_url": image_url},
                          callback=self.parse_detail)

        # extract the next page and hand it to Scrapy to download
        next_urls = response.css(".next.page-numbers::attr(href)").extract_first("")
        if next_urls:
            yield Request(url=parse.urljoin(response.url, next_urls), callback=self.parse)

    def parse_detail(self, response):
        '''
        Extract the individual fields of an article.
        :param response:
        :return:
        '''
        front_image_url = response.meta.get("front_image_url", "")  # article cover image
        item_loader = ArticleItemLoader(item=JobBoleArticleItem(), response=response)
        item_loader.add_css("title", ".entry-header h1::text")
        item_loader.add_value("url", response.url)
        item_loader.add_value("front_image_url", [front_image_url])
        item_loader.add_value("url_object_id", get_md5(response.url))
        item_loader.add_css("create_date", "p.entry-meta-hide-on-mobile::text")
        item_loader.add_css("praise_nums", ".vote-post-up h10::text")
        item_loader.add_css("comment_nums", "a[href='#article-comment'] span::text")
        item_loader.add_css("fav_nums", ".bookmark-btn::text")
        item_loader.add_css("tags", "p.entry-meta-hide-on-mobile a::text")
        item_loader.add_css("content", "div.entry")
        article_item = item_loader.load_item()
        yield article_item
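Before wiring the selectors into parse_detail, it can help to check them interactively in scrapy shell. The article URL below is only an illustrative example and may no longer resolve; the selectors are the same ones used above.

# Run from the command line (inside the virtualenv):
#   scrapy shell http://blog.jobbole.com/110287/
# Then try the selectors in the interactive shell:
response.css(".entry-header h1::text").extract_first()             # article title
response.css("p.entry-meta-hide-on-mobile::text").extract_first()  # raw date text
response.css(".vote-post-up h10::text").extract_first()            # praise count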
2. items.py
import datetime
import re

import scrapy
from scrapy.loader import ItemLoader
from scrapy.loader.processors import MapCompose, TakeFirst, Join


class ArticlespiderItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    pass


def date_convert(value):
    try:
        create_date = datetime.datetime.strptime(value, "%Y/%m/%d").date()
    except Exception as e:
        create_date = datetime.datetime.now().date()
    return create_date


def get_nums(value):
    match_re = re.match(".*?(\d+).*", value)
    if match_re:
        nums = int(match_re.group(1))
    else:
        nums = 0
    return nums


def remove_comment_tags(value):
    # drop the "评论" (comment count) entry that gets extracted along with the tags
    if "评论" in value:
        return ""
    else:
        return value


def return_value(value):
    return value


class ArticleItemLoader(ItemLoader):
    # custom ItemLoader
    default_output_processor = TakeFirst()


class JobBoleArticleItem(scrapy.Item):
    title = scrapy.Field()
    # title = scrapy.Field(
    #     input_processor=MapCompose(lambda x: x + '-jobbole')
    # )
    create_date = scrapy.Field(
        input_processor=MapCompose(date_convert),
        # output_processor=TakeFirst(),  # keep only the first value
    )
    url = scrapy.Field()
    url_object_id = scrapy.Field()
    front_image_url = scrapy.Field(
        # keep the value as a list so the ImagesPipeline can iterate over it
        # (overrides the TakeFirst default output processor)
        output_processor=MapCompose(return_value)
    )
    front_image_path = scrapy.Field()
    praise_nums = scrapy.Field(
        input_processor=MapCompose(get_nums)
    )
    comment_nums = scrapy.Field(
        input_processor=MapCompose(get_nums)
    )
    fav_nums = scrapy.Field(
        input_processor=MapCompose(get_nums)
    )
    tags = scrapy.Field(
        input_processor=MapCompose(remove_comment_tags),
        output_processor=Join(",")
    )
    content = scrapy.Field()
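A minimal standalone sketch of what these processors do to raw extracted values. The sample strings are made up, and it assumes the script is run from the project root so that ArticleSpider.items is importable.

# Demonstration only: feed sample strings through the processors used by JobBoleArticleItem.
from scrapy.loader.processors import MapCompose, TakeFirst, Join

from ArticleSpider.items import get_nums, date_convert

print(MapCompose(get_nums)([" 8 收藏"]))           # input processor for praise/comment/fav -> [8]
print(MapCompose(date_convert)(["2018/01/21"]))    # input processor for create_date -> [datetime.date(2018, 1, 21)]
print(Join(",")(["职场", "面试"]))                  # output processor for tags -> "职场,面试"
print(TakeFirst()(["first", "second"]))            # ArticleItemLoader default output processor -> "first"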
3. pipelines.py
import codecs
import json
from scrapy.pipelines.images import ImagesPipeline
from scrapy.exporters import JsonItemExporter
from twisted.enterprise import adbapi  # adbapi turns blocking MySQLdb operations into asynchronous ones
import MySQLdb
import MySQLdb.cursors


class ArticlespiderPipeline(object):
    def process_item(self, item, spider):
        return item


class JsonWithEncodingPipeline(object):
    # custom JSON file export
    def __init__(self):
        self.file = codecs.open('article.json', 'w', encoding="utf-8")

    def process_item(self, item, spider):
        lines = json.dumps(dict(item), ensure_ascii=False) + '\n'
        self.file.write(lines)
        return item

    def spider_closed(self, spider):
        self.file.close()


class MysqlPipeline(object):
    # synchronous insert: simple, but blocks the crawl on every commit
    def __init__(self):
        self.conn = MySQLdb.connect("localhost", "root", "", "article_spider", charset="utf8", use_unicode=True)
        self.cursor = self.conn.cursor()

    def process_item(self, item, spider):
        insert_sql = '''
            INSERT INTO article (title, url, create_date, fav_nums)
            VALUES (%s, %s, %s, %s)
        '''
        self.cursor.execute(insert_sql, (item['title'], item['url'], item['create_date'], item['fav_nums']))
        self.conn.commit()
        return item


class MysqlTwistedPipeline(object):
    def __init__(self, dbpool):
        self.dbpool = dbpool

    @classmethod
    def from_settings(cls, settings):
        dbparms = dict(
            host=settings["MYSQL_HOST"],
            db=settings["MYSQL_DBNAME"],
            user=settings["MYSQL_USER"],
            passwd=settings["MYSQL_PASSWORD"],
            charset='utf8',
            cursorclass=MySQLdb.cursors.DictCursor,
            use_unicode=True,
        )
        dbpool = adbapi.ConnectionPool("MySQLdb", **dbparms)
        return cls(dbpool)

    def process_item(self, item, spider):
        '''
        Use Twisted to make the MySQL insert asynchronous.
        :param item:
        :param spider:
        :return:
        '''
        query = self.dbpool.runInteraction(self.do_insert, item)
        query.addErrback(self.handle_error, item, spider)  # handle exceptions
        return item

    def handle_error(self, failure, item, spider):
        # handle exceptions raised by the asynchronous insert
        print(failure)

    def do_insert(self, cursor, item):
        # perform the actual insert
        insert_sql = '''
            INSERT INTO article (title, url, create_date, fav_nums)
            VALUES (%s, %s, %s, %s)
        '''
        cursor.execute(insert_sql, (item['title'], item['url'], item['create_date'], item['fav_nums']))


class JsonExporterPipleline(object):
    # export a JSON file using Scrapy's built-in JsonItemExporter
    def __init__(self):
        self.file = open('articleexport.json', 'wb')
        self.exporter = JsonItemExporter(self.file, encoding="utf-8", ensure_ascii=False)
        self.exporter.start_exporting()

    def close_spider(self, spider):
        self.exporter.finish_exporting()
        self.file.close()

    def process_item(self, item, spider):
        self.exporter.export_item(item)
        return item


class ArticleImagePipline(ImagesPipeline):
    def item_completed(self, results, item, info):
        # results is a list of (success, info_dict) tuples; record the local path of the downloaded cover image
        if "front_image_url" in item:
            for ok, value in results:
                image_file_path = value["path"]
            item["front_image_path"] = image_file_path
        return item
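The two MySQL pipelines assume an article table already exists in the article_spider database. The post does not show the schema, so the sketch below is only a guess: the column names come from the INSERT statements above, while the types and lengths are assumptions.

# One-off helper to create the `article` table used by the INSERT statements above.
import MySQLdb

conn = MySQLdb.connect("localhost", "root", "", "article_spider", charset="utf8", use_unicode=True)
cursor = conn.cursor()
cursor.execute("""
    CREATE TABLE IF NOT EXISTS article (
        title       VARCHAR(200) NOT NULL,
        url         VARCHAR(300) NOT NULL,
        create_date DATE,
        fav_nums    INT DEFAULT 0
    ) DEFAULT CHARSET=utf8
""")
conn.commit()
conn.close()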
4. Create a shared helper function (stored at ArticleSpider/utils/common.py)
import hashlib


def get_md5(url):
    if isinstance(url, str):
        url = url.encode("utf-8")
    m = hashlib.md5()
    m.update(url)
    return m.hexdigest()


if __name__ == '__main__':
    print(get_md5("http://jobbole.com"))
5. Configure the settings file
import os

ITEM_PIPELINES = {
    'scrapy.pipelines.images.ImagesPipeline': 1,  # image downloading
    'ArticleSpider.pipelines.MysqlTwistedPipeline': 3,
}

IMAGES_URLS_FIELD = 'front_image_url'
project_dir = os.path.abspath(os.path.dirname(__file__))
IMAGES_STORE = os.path.join(project_dir, "images")  # where downloaded images are stored

MYSQL_HOST = 'localhost'
MYSQL_DBNAME = 'article_spider'
MYSQL_USER = 'root'
MYSQL_PASSWORD = ''
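The ITEM_PIPELINES setting above only enables image downloading and the asynchronous MySQL writer. If the JSON export or the custom image-path pipeline from pipelines.py is wanted as well, they could be registered as in the sketch below; the priority numbers are arbitrary examples (lower values run first).

ITEM_PIPELINES = {
    'ArticleSpider.pipelines.ArticleImagePipline': 1,     # downloads cover images and fills in front_image_path
    'ArticleSpider.pipelines.JsonExporterPipleline': 2,   # writes articleexport.json
    'ArticleSpider.pipelines.MysqlTwistedPipeline': 3,    # asynchronous MySQL insert
}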