Scrapy template

spider.py:

import scrapy
from scrapy.http import Request
from crawl_cnblogs.crawl_cnblogs.items import ArticleItem

import scrapy_redis  # only needed for the distributed (scrapy-redis) setup shown at the end

# Note: where does the crawl actually start? In start_requests (by default it yields a Request for each url in start_urls)
class CnblogsSpider(scrapy.Spider):
    name = 'cnblogs'
    allowed_domains = ['www.cnblogs.com']
    start_urls = ['https://www.cnblogs.com/']
    # bro = webdriver.Chrome('./chromedriver')    # shared selenium browser, used by the download middleware and closed()

    # def start_requests(self):
    #     yield Request(url='http://www.baidu.com')


    def parse(self, response):
        # print(response.text)
        div_list = response.css('div.post_item')
        for div in div_list:
            item = ArticleItem()
            title = div.xpath('./div[2]/h3/a/text()').extract_first()
            item['title'] = title
            author = div.xpath('./div[2]/div/a/text()').extract_first()
            item['author'] = author
            desc = div.xpath('./div[2]/p/text()').extract()[-1]
            item['desc'] = desc
            url = div.xpath('./div[2]/div/span[2]/a/@href').extract_first()
            item['url'] = url

            # Task 1: depth -- follow each article to its detail page
            # Task 2: breadth -- follow the "next page" link below
            # Yielding an Item sends it to the pipelines; yielding a Request schedules another fetch
            # callback: where the response is parsed once it comes back (defaults to parse)
            yield Request(url=url, callback=self.parse_detail, meta={'item': item})

        # CSS selector for an attribute: ::attr(attribute_name)
        next_url = 'https://www.cnblogs.com' + response.css('div.pager>a:last-child::attr(href)').extract_first()
        # Both forms work:
        # yield Request(url=next_url, callback=self.parse)
        yield Request(url=next_url)


    def parse_detail(self,response):
        item=response.meta.get('item')
        print(item)

        content=response.css('#post_detail').extract_first()
        item['content']=str(content)
        # print(str(content))
        yield item

    def closed(self, spider):
        # Called automatically when the spider closes; shut down the shared browser here
        print("spider finished, closing the browser")
        self.bro.close()
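
items.py is not shown in the original post; the sketch below is inferred from the fields the spider fills in and the MySQL pipeline reads out (only the field names come from the code above, the rest is boilerplate):

import scrapy

class ArticleItem(scrapy.Item):
    # fields written in parse()/parse_detail() and stored by MysqlArticlePipeline
    title = scrapy.Field()
    author = scrapy.Field()
    desc = scrapy.Field()
    url = scrapy.Field()
    content = scrapy.Field()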

main.py:

from scrapy.cmdline import execute

execute(['scrapy', 'crawl', 'xxx', '--nolog'])   # replace 'xxx' with the spider name, e.g. 'cnblogs'

middlewares.py:

# selenium integration + a user-agent pool
import random
class Scrapy02DownloaderMiddleware(object):
    user_agent_list = [
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 "
        "(KHTML, like Gecko) Chrome/22.0.1207.1 Safari/537.1",
        "Mozilla/5.0 (X11; CrOS i686 2268.111.0) AppleWebKit/536.11 "
        "(KHTML, like Gecko) Chrome/20.0.1132.57 Safari/536.11",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.6 "
        "(KHTML, like Gecko) Chrome/20.0.1092.0 Safari/536.6",
        "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.6"
        "(KHTML, like Gecko) Chrome/20.0.1090.0 Safari/536.6",
        "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.1 "
        "(KHTML, like Gecko) Chrome/19.77.34.5 Safari/537.1",
        "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/536.5 "
        "(KHTML, like Gecko) Chrome/19.0.1084.9 Safari/536.5",
        "Mozilla/5.0 (Windows NT 6.0) AppleWebKit/536.5 "
        "(KHTML, like Gecko) Chrome/19.0.1084.36 Safari/536.5",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 "
        "(KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
        "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/536.3 "
        "(KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_0) AppleWebKit/536.3 "
        "(KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 "
        "(KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 "
        "(KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 "
        "(KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 "
        "(KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/536.3 "
        "(KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 "
        "(KHTML, like Gecko) Chrome/19.0.1061.0 Safari/536.3",
        "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/535.24 "
        "(KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24",
        "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/535.24 "
        "(KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24"
    ]
    def process_request(self, request, spider):
        # Rotate the User-Agent via the request headers
        # print(request.headers)
        # request.headers['User-Agent'] = random.choice(self.user_agent_list)

        # Set cookies (not every request needs them -- add a check if necessary);
        # a cookie pool can be used instead of hard-coded values
        # print(request.cookies)
        # # import requests  # if you run your own cookie pool, fetch a cookie like this:
        # # ret = requests.get('127.0.0.1/get').json()['cookie']
        # # request.cookies = ret
        # request.cookies = {'name': 'lqz', 'age': 18}

        # Use a proxy (e.g. drawn from a proxy pool)
        # print(request.meta)
        # request.meta['proxy'] = 'http://117.27.152.236:1080'
        return None
    def process_response(self, request, response, spider):
        from scrapy.http import HtmlResponse
        import time
        # The downloader cannot execute JS, so load the page in selenium, let the JS
        # (e.g. scrolling) finish, then hand Scrapy a new response built from the rendered HTML.
        # Note: as written, every response is re-fetched through the browser; add a guard if needed.
        url = request.url
        spider.bro.get(url)
        time.sleep(2)                          # crude wait for the page's JS to finish
        page_source = spider.bro.page_source   # rendered HTML, read after the wait
        new_response = HtmlResponse(url=url, body=page_source, encoding='utf-8', request=request)
        return new_response

    # Exception handling: return a new Request to re-schedule on failure
    # def process_exception(self, request, exception, spider):
    #     from scrapy.http import Request
    #     print('request failed')
    #     # request.url = 'https://www.baidu.com/'
    #     request = Request(url='https://www.baidu.com/')
    #     return request
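
process_response above relies on spider.bro, i.e. the webdriver line that is commented out in spider.py. A minimal sketch of wiring it up (assuming selenium is installed and chromedriver sits in the project directory, as in that commented line):

from selenium import webdriver

class CnblogsSpider(scrapy.Spider):
    name = 'cnblogs'
    start_urls = ['https://www.cnblogs.com/']

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        # one shared browser instance; the download middleware calls spider.bro.get(url)
        self.bro = webdriver.Chrome('./chromedriver')

    def closed(self, spider):
        self.bro.close()   # shut the browser down when the spider finishes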

pipelines.py:

# Write items to MySQL synchronously with pymysql
import pymysql


class MysqlArticlePipeline(object):
    def open_spider(self, spider):
        self.conn = pymysql.connect(host='127.0.0.1', user='root', password="123",
                                    database='cnblogs', port=3306)

    def process_item(self, item, spider):
        cursor = self.conn.cursor()
        # Parameterized query: let pymysql handle quoting/escaping instead of string formatting
        sql = "insert into article (title,author,url,`desc`,content) values (%s,%s,%s,%s,%s)"
        cursor.execute(sql, (item['title'], item['author'], item['url'], item['desc'], item['content']))
        self.conn.commit()
        return item

    def close_spider(self, spider):
        self.conn.close()
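
The pipeline assumes an article table already exists in the cnblogs database; a possible DDL (only the column names come from the insert statement above, the types are assumptions):

CREATE TABLE article (
    id      INT PRIMARY KEY AUTO_INCREMENT,
    title   VARCHAR(255),
    author  VARCHAR(128),
    url     VARCHAR(512),
    `desc`  VARCHAR(1024),
    content LONGTEXT
);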

settings.py:

BOT_NAME = 'py1'

SPIDER_MODULES = ['py1.spiders']
NEWSPIDER_MODULE = 'py1.spiders'

ROBOTSTXT_OBEY = False

# Configure maximum concurrent requests performed by Scrapy (default: 16)
CONCURRENT_REQUESTS = 32
DOWNLOAD_DELAY = 3     # delay (in seconds) between consecutive requests

COOKIES_ENABLED = False     # disable cookies when they are not needed

# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False

# Override the default request headers:
DEFAULT_REQUEST_HEADERS = {
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Language': 'en',
    'Referer': '',      # used to pass anti-hotlink checks (e.g. for images)
    'cookie': '',
}

DOWNLOADER_MIDDLEWARES = {
   # point this at the middleware class defined in middlewares.py
   # (Scrapy02DownloaderMiddleware in the listing above)
   'py1.middlewares.Py1DownloaderMiddleware': 543,
}
ITEM_PIPELINES = {
   # register the pipeline class defined in pipelines.py (MysqlArticlePipeline above)
   'py1.pipelines.Py1Pipeline': 300,
}

# redis configuration:
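# scrapy-redis reads its connection info from these settings (the values below are
# placeholders/assumptions, adjust to your redis instance):
# REDIS_HOST = '127.0.0.1'
# REDIS_PORT = 6379
# ...or a single connection string:
# REDIS_URL = 'redis://127.0.0.1:6379'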

# mysql configuration:
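# The MySQL connection is hard-coded in MysqlArticlePipeline above; if you prefer to
# keep it here, these would be custom keys read via crawler.settings (not built-in
# Scrapy settings -- names and values are assumptions):
# MYSQL_HOST = '127.0.0.1'
# MYSQL_PORT = 3306
# MYSQL_USER = 'root'
# MYSQL_PASSWORD = '123'
# MYSQL_DB = 'cnblogs'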

# Distributed crawling with scrapy-redis
SCHEDULER = "scrapy_redis.scheduler.Scheduler"
# Ensure all spiders share same duplicates filter through redis.
DUPEFILTER_CLASS = "scrapy_redis.dupefilter.RFPDupeFilter"

# Note: this assignment replaces the ITEM_PIPELINES dict defined above;
# merge both entries into a single dict if the MySQL pipeline should keep running.
ITEM_PIPELINES = {
    'scrapy_redis.pipelines.RedisPipeline': 300,
}
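
With the scrapy-redis settings above, the spider itself usually switches to RedisSpider and pulls its start URLs from a redis list instead of start_urls; a minimal sketch (the key name cnblogs:start_urls is an assumption):

from scrapy_redis.spiders import RedisSpider

class CnblogsRedisSpider(RedisSpider):
    name = 'cnblogs_redis'
    allowed_domains = ['www.cnblogs.com']
    # every worker blocks on this redis key; kick off the crawl with:
    #   lpush cnblogs:start_urls https://www.cnblogs.com/
    redis_key = 'cnblogs:start_urls'

    def parse(self, response):
        # same parsing logic as CnblogsSpider.parse above
        ...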