[转]Scrapy递归抓取
方法1.
from scrapy.selector import HtmlXPathSelector

def parse(self, response):
    """Parse one page: queue every extracted link for recursive crawling
    and scrape a DmozItem per //ul/li entry on the current page.

    Returns a list mixing Requests (recursion) and scraped items, which
    Scrapy dispatches by type.
    """
    hxs = HtmlXPathSelector(response)
    results = []

    # Re-request every link on the page with this same callback.
    new_urls = hxs.select('//a/@href').extract()
    valid_urls = []
    for url in new_urls:
        # Decide whether the URL is legal to follow.
        # BUG FIX: original wrote `if true:` — a NameError; Python's
        # boolean constant is `True`.  TODO: replace the constant with a
        # real validity check.
        if True:
            valid_urls.append(url)
    # BUG FIX: the original re-bound `items = []` right after building
    # these Requests, silently discarding all of them.  Accumulate into
    # a single result list instead.
    results.extend(
        self.make_requests_from_url(url).replace(callback=self.parse)
        for url in valid_urls
    )

    # Scrape the listing entries on the current page.
    for site in hxs.select('//ul/li'):
        item = DmozItem()
        item['title'] = site.select('a/text()').extract()
        item['link'] = site.select('a/@href').extract()
        item['desc'] = site.select('text()').extract()
        results.append(item)

    return results
方法2.
from scrapy.selector import HtmlXPathSelector
# BUG FIX: CrawlSpider and Request were referenced below but never
# imported; this snippet raised NameError as pasted.
from scrapy.contrib.spiders import CrawlSpider
from scrapy.http import Request
from sitemap.items import SitemapItem

import urllib
import simplejson
import exceptions
import pickle


class SitemapSpider(CrawlSpider):
    """Crawl qunar.com /routes/ pages.

    For each page: yield follow-up Requests for every on-site /routes/
    link, then yield one SitemapItem carrying the page url plus its
    meta keywords/description.
    """
    name = 'sitemap_spider'
    allowed_domains = ['qunar.com']
    start_urls = ['http://www.qunar.com/routes/']

    # Rules intentionally empty: link following is done manually in
    # parse() below.  (Original experiments kept for reference.)
    rules = (
        #Rule(SgmlLinkExtractor(allow=(r'http://www.qunar.com/routes/.*')), callback='parse'),
        #Rule(SgmlLinkExtractor(allow=('http:.*/routes/.*')), callback='parse'),
    )

    def parse(self, response):
        item = SitemapItem()
        hxs = HtmlXPathSelector(response)

        # Follow every /routes/ link, normalising relative hrefs to
        # absolute ones before requesting them.
        for url in hxs.select("//a/@href").extract():
            if 'routes' in url:
                if 'http' not in url:
                    url = 'http://www.qunar.com' + url
                yield Request(url)

        item['url'] = response.url.encode('UTF-8')
        # BUG FIX: the original indexed [0] unconditionally and raised
        # IndexError on pages missing the meta tag; fall back to an
        # empty string instead.
        arr_keywords = hxs.select("//meta[@name='keywords']/@content").extract()
        item['keywords'] = (arr_keywords[0] if arr_keywords else u'').encode('UTF-8')
        arr_description = hxs.select("//meta[@name='description']/@content").extract()
        item['description'] = (arr_description[0] if arr_description else u'').encode('UTF-8')

        yield item
关于rule.
rules = (
    # URLs matching this pattern are only followed for further links —
    # no callback, so no content is scraped from them.  (The URLs here
    # are placeholders; substitute real ones in practice.)
    # BUG FIX: '?' is a regex metacharacter (makes the preceding token
    # optional); escape it so the pattern matches a literal query string.
    Rule(SgmlLinkExtractor(allow=(r'http://test_url/test\?page_index=\d+'))),
    # URLs matching this pattern have their content scraped via
    # parse_item.  (Placeholder URL; substitute a real one.)
    # BUG FIX: original read 'test_rul' — typo for 'test_url',
    # inconsistent with the sibling rule above.
    Rule(SgmlLinkExtractor(allow=(r'http://test_url/test\?product_id=\d+')), callback="parse_item"),
)