scrapy crawl rules设置
# Single crawl rule: links are taken only from the <li class="next_article">
# element, must match the article-details path, and every fetched page is
# passed to parse_item; follow=True keeps extracting links from those pages.
rules = [
    Rule(
        SgmlLinkExtractor(
            allow='/u012150179/article/details',
            restrict_xpaths='//li[@class="next_article"]',
        ),
        callback='parse_item',
        follow=True,
    ),
]


def parse_item(self, response):
    """Build a CsdnblogcrawlspiderItem from one article page.

    ``blog_name`` is set to the list of UTF-8 encoded text nodes matched by
    the title XPath, and ``blog_url`` to the UTF-8 encoded response URL.
    """
    title_xpath = '//div[@id="article_details"]/div/h1/span/a/text()'

    item = CsdnblogcrawlspiderItem()
    page_url = str(response.url)
    titles = response.xpath(title_xpath).extract()

    item['blog_name'] = [title.encode('utf-8') for title in titles]
    item['blog_url'] = page_url.encode('utf-8')
    return item