1.创建scrapy工程:scrapy startproject projectName
2.创建爬虫文件:scrapy genspider -t crawl spiderName www.xxx.com
# -*- coding: utf-8 -*-
import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule


class CrawldemoSpider(CrawlSpider):
    """Demo CrawlSpider that walks the picture listing pages of qiushibaike.com.

    Two link extractors select the listing URLs to visit; every page they
    match is handed to ``parse_item`` and is itself searched for further
    matching links (``follow=True``).
    """

    name = 'qiubai'
    # Domain restriction is intentionally left disabled:
    # allowed_domains = ['www.qiushibaike.com']
    start_urls = ['https://www.qiushibaike.com/pic/']

    # Link extractors: pull the URLs matching the given pattern out of the
    # pages returned for the start URLs.
    # Paginated listing pages; the "s=" query value that follows is a random
    # number, so the pattern stops at the "?".
    link = LinkExtractor(allow=r'/pic/page/\d+\?')
    # The first listing page.
    link1 = LinkExtractor(allow=r'/pic/$')

    # ``rules`` holds the rule parsers. Each one applies a parsing callback to
    # every page reached through the links its extractor found. Both
    # extractors share the same callback and follow setting.
    rules = tuple(
        Rule(extractor, callback='parse_item', follow=True)
        for extractor in (link, link1)
    )

    def parse_item(self, response):
        """Handle one crawled page; this demo only prints the response."""
        print(response)