Learning web crawling: using CrawlSpider
Crawling with Scrapy's CrawlSpider class.
I had always used BaseSpider with chained callbacks, and one problem kept coming up: when title and date sit on one page while author and detail sit on another, how do you combine all of these fields into a single item? I tried repeatedly, including with global variables, without success. (The idiomatic Scrapy answer is to carry the half-filled item from one callback to the next via Request.meta; a sketch follows.)
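The sketch below is mine, not from the original post: it fills title and date on the list page, then passes the partial item through Request.meta so the detail-page callback can finish it. The site layout, XPaths, and names (MetaPassSpider, parse_detail) are hypothetical; Scrapy >= 1.0 is assumed.

from scrapy.http import Request
from scrapy.spiders import Spider

class MetaPassSpider(Spider):
    name = 'metapass_demo'
    start_urls = ['http://example.com/list.html']  # placeholder URL

    def parse(self, response):
        # Hypothetical list-page layout: one <div class="row"> per article.
        for row in response.xpath('//div[@class="row"]'):
            item = {
                'title': row.xpath('a/text()').extract_first(),
                'date': row.xpath('span/text()').extract_first(),
            }
            detail_url = row.xpath('a/@href').extract_first()
            if detail_url is None:
                continue
            # Carry the half-filled item to the next callback via meta.
            yield Request(response.urljoin(detail_url),
                          meta={'item': item},
                          callback=self.parse_detail)

    def parse_detail(self, response):
        # Retrieve the item started on the list page and finish it here.
        item = response.meta['item']
        item['author'] = response.xpath('//p[@class="author"]/text()').extract_first()
        item['detail'] = response.xpath('//div[@class="body"]//text()').extract()
        yield item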
Here, though, I wanted to try the more advanced CrawlSpider instead. Reference example code:
from scrapy.selector import Selector
from scrapy.http import Request
# from scrapy.contrib.spiders import CrawlSpider  # old import path, pre-Scrapy 1.0
from scrapy.spiders import CrawlSpider
from scrapy.loader import ItemLoader
# Note: SgmlLinkExtractor is deprecated in newer Scrapy;
# scrapy.linkextractors.LinkExtractor is the modern replacement.
from scrapy.linkextractors.sgml import SgmlLinkExtractor
from bbsdmoz.items import BbsdmozItem

class formSpider(CrawlSpider):
    name = 'bbsSpider'
    allowed_domains = ['bbs.sjtu.edu.cn']  # fixed: the original had allow_domain, which Scrapy ignores
    start_urls = ['https://bbs.sjtu.edu.cn/bbsall']
    # One extractor per link type: board index pages, pagination, and posts.
    link_extractor = {
        'page': SgmlLinkExtractor(allow=r'/bbsdoc,board,\w+\.html$'),
        'page_down': SgmlLinkExtractor(allow=r'/bbsdoc,board,\w+,page,\d+\.html$'),
        'content': SgmlLinkExtractor(allow=r'/bbscon,board,\w+,file,M\.\d+\.A\.html$'),
    }
    # XPath queries for the fields scraped from a post page.
    _x_query = {
        'page_content': '//pre/text()[2]',
        'poster': '//pre/a/text()',
        'forum': '//center/text()[2]',
    }

    def parse(self, response):
        # Board index: follow each board link.
        for link in self.link_extractor['page'].extract_links(response):
            yield Request(url=link.url, callback=self.parse_page)

    def parse_page(self, response):
        # Pagination links feed back into this same callback.
        for link in self.link_extractor['page_down'].extract_links(response):
            yield Request(url=link.url, callback=self.parse_page)
        # Links to individual posts go to parse_content.
        for link in self.link_extractor['content'].extract_links(response):
            yield Request(url=link.url, callback=self.parse_content)

    def parse_content(self, response):
        bbsItem_loader = ItemLoader(item=BbsdmozItem(), response=response)
        url = str(response.url)
        bbsItem_loader.add_value('url', url)
        bbsItem_loader.add_xpath('forum', self._x_query['forum'])
        bbsItem_loader.add_xpath('poster', self._x_query['poster'])
        bbsItem_loader.add_xpath('content', self._x_query['page_content'])
        return bbsItem_loader.load_item()
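One thing worth noting about this example: it subclasses CrawlSpider but never defines rules, and it overrides parse(), which CrawlSpider reserves for its own internal logic, so it effectively behaves like a plain Spider. For comparison, here is a sketch of the same crawl in CrawlSpider's intended style, using Rule objects and the modern LinkExtractor (my own reconstruction, assuming Scrapy >= 1.0; the regexes and XPaths are the ones from the example above):

from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor

class BbsRulesSpider(CrawlSpider):
    name = 'bbsSpider_rules'
    allowed_domains = ['bbs.sjtu.edu.cn']
    start_urls = ['https://bbs.sjtu.edu.cn/bbsall']
    rules = (
        # Board index and pagination pages: follow them, no callback needed.
        Rule(LinkExtractor(allow=(r'/bbsdoc,board,\w+\.html$',
                                  r'/bbsdoc,board,\w+,page,\d+\.html$'))),
        # Post pages: hand them to parse_content.
        Rule(LinkExtractor(allow=r'/bbscon,board,\w+,file,M\.\d+\.A\.html$'),
             callback='parse_content'),
    )

    # With rules in play, parse() must not be overridden.
    def parse_content(self, response):
        yield {
            'url': response.url,
            'forum': response.xpath('//center/text()[2]').extract_first(),
            'poster': response.xpath('//pre/a/text()').extract_first(),
            'content': response.xpath('//pre/text()[2]').extract_first(),
        }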
After a little adaptation, the code becomes:
# Imports as in the previous example; DmozItem is assumed to live in the
# project's items module (the project here is "tutorial").
from scrapy.http import Request
from scrapy.spiders import CrawlSpider
from scrapy.loader import ItemLoader
from scrapy.linkextractors.sgml import SgmlLinkExtractor
from tutorial.items import DmozItem

class MySpider6(CrawlSpider):
    name = "myspider6"
    allowed_domains = ["10.60.32.179"]
    start_urls = [
        'http://10.60.32.179/Site/Site1/myindex.shtml',
        # 'http://example.com/page2',
    ]
    link_extractor = {
        # 'page': SgmlLinkExtractor(allow=r'/bbsdoc,board,\w+\.html$'),
        # 'page_down': SgmlLinkExtractor(allow=r'/bbsdoc,board,\w+,page,\d+\.html$'),
        'page': SgmlLinkExtractor(allow=r'/Article/\w+/\w+\.shtml$'),
    }

    # XPaths for the fields on the list page ...
    _x_query = {
        'date': 'span/text()',
        'title': 'a/text()',
    }
    # ... and for the detail text on the article page.
    _y_query = {
        'detail': '/html/body/center/div/div[4]/div[1]/p[1]',
    }

    def parse(self, response):
        for link in self.link_extractor['page'].extract_links(response):
            yield Request(url=link.url, callback=self.parse_content)

    def parse_content(self, response):
        bbsItem_loader = ItemLoader(item=DmozItem(), response=response)
        url = str(response.url)
        bbsItem_loader.add_value('desc', url)
        bbsItem_loader.add_xpath('title', self._x_query['title'])
        bbsItem_loader.add_xpath('date', self._x_query['date'])
        bbsItem_loader.add_xpath('detail', self._y_query['detail'])
        return bbsItem_loader.load_item()
2 name = "myspider6"
3 allowed_domains = ["10.60.32.179"]
4 start_urls = [
5 'http://10.60.32.179/Site/Site1/myindex.shtml',
6 #'http://example.com/page2',
7 ]
8 link_extractor={
9 # 'page':SgmlLinkExtractor(allow='/bbsdoc,board,\w+\.html$'),
10 # 'page_down':SgmlLinkExtractor(allow='/bbsdoc,board,\w+,page,\d+\.html$'),
11 'page':SgmlLinkExtractor(allow='/Article/\w+\/\w+\.shtml$'),
12 }
13
14 _x_query={
15 'date':'span/text()',
16 'title':'a/text()',
17 }
18 _y_query={
19 'detail':'/html/body/center/div/div[4]/div[1]/p[1]',
20 }
21
22 def parse(self,response):
23 for link in self.link_extractor['page'].extract_links(response):
24 yield Request(url=link.url,callback=self.parse_content)
25
26
27
28 def parse_content(self,response):
29 bbsItem_loader=ItemLoader(item=DmozItem(),response=response)
30 url=str(response.url)
31 bbsItem_loader.add_value('desc',url)
32 bbsItem_loader.add_xpath('title',self._x_query['title'])
33 bbsItem_loader.add_xpath('date',self._x_query['date'])
34 bbsItem_loader.add_xpath('detail',self._y_query['detail'])
35 return bbsItem_loader.load_item()
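A side note on the ItemLoader: by default it collects every field as a list; to get single values you would attach TakeFirst as the output processor. The original post never shows items.py, so the following DmozItem is a hypothetical reconstruction inferred from the add_value/add_xpath calls above:

# tutorial/items.py (hypothetical -- the original post does not show this file)
from scrapy import Field, Item
from scrapy.loader.processors import TakeFirst

class DmozItem(Item):
    # Without an output processor, ItemLoader stores each field as a list;
    # TakeFirst keeps the first non-empty value instead.
    title = Field(output_processor=TakeFirst())
    date = Field(output_processor=TakeFirst())
    detail = Field(output_processor=TakeFirst())
    desc = Field(output_processor=TakeFirst())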
Run it, and it succeeds:
D:\test-python\tutorial>\Python27\Scripts\scrapy.exe crawl myspider6 -o ee.json