Common crawler settings
1. settings.py
#!/usr/bin/python
# -*- coding: utf-8 -*-
"""
@author: yugengde
@contact: yugengde@163.com
@file: settings.py
@time: 2017/11/22 15:41
"""

BOT_NAME = 'pro'

SPIDER_MODULES = ['pro.spiders']
NEWSPIDER_MODULE = 'pro.spiders'

ROBOTSTXT_OBEY = False

DOWNLOAD_DELAY = 3
COOKIES_ENABLED = False

DOWNLOADER_MIDDLEWARES = {
    'pro.middlewares.PhantomJSMiddleware': 301,
    'pro.middlewares.UserAgentMiddleware': 300,
}

ITEM_PIPELINES = {
    'scrapy_redis.pipelines.RedisPipeline': 301,
    'pro.pipelines.DuplicatesPipeline': 300,
}

LOG_ENABLED = True
LOG_ENCODING = 'utf-8'
LOG_FILE = 'pro.log'
LOG_LEVEL = 'DEBUG'
# LOG_STDOUT =

SCHEDULER = "scrapy_redis.scheduler.Scheduler"
DUPEFILTER_CLASS = "scrapy_redis.dupefilter.RFPDupeFilter"
REDIS_URL = 'redis://root:password@localhost:6379'
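The scrapy_redis settings above (SCHEDULER, DUPEFILTER_CLASS, REDIS_URL and RedisPipeline) are meant to be paired with a spider that pulls its start URLs from Redis. The original spider is not shown, so the sketch below is only an assumption: the spider name, redis_key and parsed fields are illustrative, with item_id and title chosen to match DuplicatesPipeline.

# Minimal sketch of a spider matching the scrapy_redis settings above.
# The spider name, redis_key and parsed fields are assumptions, not from the original project.
from scrapy_redis.spiders import RedisSpider


class ProSpider(RedisSpider):
    name = 'pro'
    # Start URLs are popped from this Redis list instead of a start_urls attribute,
    # e.g. pushed with: redis-cli lpush pro:start_urls http://example.com/
    redis_key = 'pro:start_urls'

    def parse(self, response):
        # item_id and title match the fields DuplicatesPipeline expects (see pipelines.py)
        yield {
            'item_id': response.url,
            'title': response.css('title::text').extract_first(),
        }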
2. middlewares.py
from selenium import webdriver
from scrapy.http import HtmlResponse
import random


class PhantomJSMiddleware(object):
    @classmethod
    def process_request(cls, request, spider):
        # Render the page with PhantomJS and hand the rendered HTML back to Scrapy
        driver = webdriver.PhantomJS(r'C:\InstallFile\Phantomjs\bin\phantomjs.exe')
        driver.get(request.url)
        content = driver.page_source.encode('utf-8')
        driver.quit()

        return HtmlResponse(request.url, encoding='utf-8', body=content, request=request)


class UserAgentMiddleware(object):
    @classmethod
    def process_request(cls, request, spider):
        user_agents = [
            "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:21.0) Gecko/20130331 Firefox/21.0",
            "Mozilla/5.0 (Windows x86; rv:19.0) Gecko/20100101 Firefox/19.0",
            # these strings can also be generated with the UserAgent() function
        ]
        # set a random User-Agent header on each outgoing request
        request.headers.setdefault('User-Agent', random.choice(user_agents))
3. pipelines.py
#!/usr/bin/python
# -*- coding: utf-8 -*-

from scrapy.exceptions import DropItem


# De-duplicate items
class DuplicatesPipeline(object):
    def __init__(self):
        self.ids_seen = set()

    def process_item(self, item, spider):
        if not item['title']:
            raise DropItem("Missing title in %s" % item)

        if item['item_id'] in self.ids_seen:
            raise DropItem("Duplicate item found: %s" % item)
        else:
            self.ids_seen.add(item['item_id'])
            # return the item so that later pipeline stages receive it
            return item
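DuplicatesPipeline assumes every item carries title and item_id fields. A matching items.py would look roughly like the sketch below; the field names come from the pipeline, while the class name is an assumption.

# Sketch of an Item definition matching DuplicatesPipeline (class name is illustrative)
import scrapy


class ProItem(scrapy.Item):
    item_id = scrapy.Field()   # deduplication key checked against ids_seen
    title = scrapy.Field()     # items with an empty title are dropped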