19 03 12 huanqiu.com (环球网): three-level page navigation plus pagination, scraping and printing the data (no-database version)
This version skips the database for now; scraped items are simply yielded and printed.
spider
```python
# -*- coding: utf-8 -*-
import scrapy
# from yang_guan.items import YangGuanItem
from copy import deepcopy


class YgSpider(scrapy.Spider):
    name = 'yg'
    # allowed_domains = ['huanqiu.com']
    start_urls = ['http://www.huanqiu.com/']

    def parse(self, response):
        # Top-level page. The first callback must be parse(); it receives
        # the responses for start_urls.
        item = {}
        class_news_urls_li = response.xpath(".//div[@class='navCon']/ul/li/a")
        print(class_news_urls_li)
        for class_news_url in class_news_urls_li:
            item["class_title"] = class_news_url.xpath("./text()").extract_first()
            print(item)
            new_url = class_news_url.xpath("./@href").extract_first()
            print(new_url)
            yield scrapy.Request(
                new_url,
                callback=self.second_class,
                # Requests are handled concurrently, so pass a deepcopy of
                # the item; otherwise every callback would share (and
                # overwrite) the same dict.
                meta={"item": deepcopy(item)},
            )

    def second_class(self, response):
        # Second-level page: one category page per nav entry.
        item = response.meta["item"]
        print(response.url)
        second_urls = response.xpath(".//div/h2/em")
        for second_url in second_urls:
            second_news_url = second_url.xpath("./a/@href").extract_first()
            yield scrapy.Request(
                second_news_url,
                callback=self.parse_detail_analyze,
                meta={"item": deepcopy(item)},
            )

    def parse_detail_analyze(self, response):
        # Third level: scrape the article list itself,
        # e.g. http://china.huanqiu.com/leaders/
        item = response.meta["item"]
        li_list = response.xpath("//ul[@class='listPicBox']/li")
        for li in li_list:
            # item = YangGuanItem()
            item["title"] = li.xpath("./h3/a/text()").extract_first()
            item["img_url"] = li.xpath("./a/img/@src").extract_first()
            item["detail"] = li.xpath("./h5/text()").extract_first()
            yield item
        # Pagination: grab the "next page" link and re-enter this callback.
        next_url = response.xpath(".//div[@class='pageBox']/div/a[last()]/@href").extract_first()
        if next_url:  # extract_first() returns None on the last page
            yield scrapy.Request(
                next_url,
                callback=self.parse_detail_analyze,
                meta={"item": response.meta["item"]},
            )
```
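The commented-out `from yang_guan.items import YangGuanItem` suggests an `items.py` was planned but the spider uses a plain dict instead. A minimal sketch of what that Item class could look like, given the four fields the spider fills (the class and module names come from the commented import; the field list is inferred, not the post's original file):

```python
# items.py -- hypothetical sketch matching the fields the spider sets
import scrapy


class YangGuanItem(scrapy.Item):
    class_title = scrapy.Field()  # section name from the nav bar
    title = scrapy.Field()        # article headline
    img_url = scrapy.Field()      # thumbnail image URL
    detail = scrapy.Field()       # summary text under the headline
```

Either way, running `scrapy crawl yg` from the project root executes the spider; the yielded items show up in the console output, which is all this no-database version does.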
About settings
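A minimal `settings.py` sketch for this kind of crawl. Only the `yang_guan` project name is grounded in the spider's import; the remaining values are common-sense assumptions, not the post's original configuration:

```python
# settings.py -- hypothetical minimal configuration for this spider
BOT_NAME = 'yang_guan'

SPIDER_MODULES = ['yang_guan.spiders']
NEWSPIDER_MODULE = 'yang_guan.spiders'

# News sites often reject the default Scrapy user agent, so pretend
# to be a regular browser (assumed; any modern UA string works).
USER_AGENT = ('Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
              'AppleWebKit/537.36 (KHTML, like Gecko) '
              'Chrome/72.0.3626.121 Safari/537.36')

# Crawl without consulting robots.txt (assumption, matching the
# print-everything nature of this demo).
ROBOTSTXT_OBEY = False

# No ITEM_PIPELINES entry: items are only printed/logged in this version.
```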