1. create a spider with the genspider command
scrapy genspider your_spider_name the_domain   # e.g. scrapy genspider baidu baidu.com
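This creates a spider skeleton in the current directory (or under spiders/ inside a project). Roughly, depending on your Scrapy version, the generated file for the baidu example looks like:

import scrapy

class BaiduSpider(scrapy.Spider):
    name = 'baidu'
    allowed_domains = ['baidu.com']
    start_urls = ['http://baidu.com/']

    def parse(self, response):
        pass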
2. open the generated .py file, then edit start_urls and the parse function
def parse(self, response):
    self.log('i just visited: ' + response.url)
    yield {
        'li': response.css('.entry-content > ul > li > a::text').extract_first()
    }
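Put together, a minimal runnable spider looks like the sketch below; the example.com URL is a placeholder for whatever page your selectors actually target.

import scrapy

class LiSpider(scrapy.Spider):
    name = 'li_spider'
    # placeholder start URL; point this at the real list page
    start_urls = ['https://example.com/']

    def parse(self, response):
        self.log('i just visited: ' + response.url)
        yield {
            'li': response.css('.entry-content > ul > li > a::text').extract_first()
        }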
3. run the spider and save the result
scrapy runspider yourSpiderName.py -o someFileName.json
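The output format follows the file extension, so -o also takes .csv, .xml, and .jl (JSON lines, which is safer to append to than .json). Inside a full project you would run scrapy crawl with the spider's name instead of runspider:

scrapy runspider yourSpiderName.py -o someFileName.csv
scrapy runspider yourSpiderName.py -o someFileName.jl
scrapy crawl your_spider_name -o someFileName.json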
4. multiple items from a page
def parse(self, response):
    self.log('i just visited: ' + response.url)
    for article in response.css('div.article'):
        item = {
            'title': article.css('.title::text').extract_first(),
            'author': article.css('.author::text').extract_first(),
            'tag': article.css('.tag::text').extract(),
        }
        yield item
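Note the difference: extract_first() returns the first match (or None if nothing matches), while extract() returns a list of every match, so 'tag' ends up as a list of all tags on the article. Recent Scrapy versions also expose these as get() and getall().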
5. get the next page url
next_page = response.css('li.next > a::attr(href)').extract_first()  # next_page avoids shadowing the builtin next
if next_page:
    next_page = response.urljoin(next_page)
    yield scrapy.Request(url=next_page, callback=self.parse)

(this needs import scrapy at the top of the file, which genspider adds for you)
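On Scrapy 1.4 or newer, response.follow joins relative URLs for you, so the same pagination step can be written more compactly:

next_page = response.css('li.next > a::attr(href)').extract_first()
if next_page:
    yield response.follow(next_page, callback=self.parse)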
6. scrape detail pages from the list
def parse(self, response):
    urls = response.css('div.entry-content > ul > li > a::attr(href)').extract()
    for url in urls:
        url = response.urljoin(url)
        yield scrapy.Request(url=url, callback=self.parse_details)

def parse_details(self, response):
    yield {
        'title': response.css('h3.title::text').extract_first(),
        'content': response.css('p.content::text').extract_first()
    }
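If the detail page needs something that is only visible on the list page, you can pass it along with the request via cb_kwargs (Scrapy 1.7+; older versions use request.meta). A sketch, assuming a hypothetical .summary element on each list entry:

def parse(self, response):
    for li in response.css('div.entry-content > ul > li'):
        url = response.urljoin(li.css('a::attr(href)').extract_first())
        summary = li.css('.summary::text').extract_first()  # hypothetical selector, adjust to your markup
        yield scrapy.Request(url=url, callback=self.parse_details,
                             cb_kwargs={'summary': summary})

def parse_details(self, response, summary):
    yield {
        'title': response.css('h3.title::text').extract_first(),
        'summary': summary,  # carried over from the list page
        'content': response.css('p.content::text').extract_first(),
    }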