scrapy爬取极客学院全部课程

 1 # -*- coding: utf-8 -*-
 2 # scrapy爬取极客学院全部课程
 3 import scrapy
 4 from pyquery import PyQuery as pq
 5 from jike.items import JikeItem
 6 
 7 class JikespiderSpider(scrapy.Spider):
 8     name = "jikespider"
 9     allowed_domains = ["www.jikexueyuan.com"]
10     base_url = 'http://www.jikexueyuan.com/course/?pageNum='
11 
12     def start_requests(self):
13         for page_num in range(1,96):
14             url = self.base_url + str(page_num)
15             yield scrapy.Request(url, callback=self.parse_index)
16 
17     def parse_index(self, response):
18         doc = pq(response.text)
19         lis = doc('.lesson-list .cf li').items()
20         # pyquery心得, 以为pyquery有点问题而导致无法遍历数据结构,
21         # 研究发现是'http:' + item('.lessonimg-box a').attr('href')
22         # 的问题, href是相对路径没有得到一个有效的请求链接
23         for item in lis:
24             detail_url = 'http:' + item('.lessonimg-box a').attr('href')
25             yield scrapy.Request(url=detail_url,callback=self.parse_detail)
26 
27     def parse_detail(self, response):
28         item = JikeItem()
29         doc = pq(response.text)
30         item['title'] = doc('.lesson-teacher .bc-box h2').text()
31         item['time'] = doc('.lesson-teacher .bc-box .timebox').text()
32         item['content'] = doc('.lesson-teacher .infor-content').text()
33 
34         yield item

 

posted @ 2017-06-25 18:12  道高一尺  阅读(560)  评论(0)  编辑  收藏  举报