Python - Web Scraping - Basic Library (requests) Usage - Scraping the Maoyan Top 100 Movie Board
# Scrape the movie list from the Maoyan Top 100 board: https://maoyan.com/board/4
import requests
import re

# Spider utility class
class SpiderTools():
    def __init__(self):
        super(SpiderTools, self).__init__()

    # Fetch and parse one page of the board
    def load_onePage(self, url):
        self.headers = {
            'Host': 'maoyan.com',
            'Accept': 'text/html, application/xhtml+xml, */*',
            'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:64.0) Gecko/20100101 Firefox/64.0'
        }
        res = requests.get(url, headers=self.headers)
        # print(res.text)
        # Parse out rank, title, poster URL, starring actors and release date
        pattern = re.compile('<dd>.*?board-index.*?>(.*?)</i>.*?<p class="name".*?><a.*?>(.*?)</a>.*?<img data-src="(.*?)".*?>.*?<p class="star">(.*?)</p>.*?<p class="releasetime">(.*?)</p>', re.S)
        result = re.findall(pattern, res.text)
        # Pack each match into a dictionary
        # print(result[0])
        items = []
        for i in result:
            item = {
                "order": i[0],
                "name": i[1],
                "imageURL": i[2],
                "auth": i[3].strip(),  # strip() removes surrounding newlines and spaces
                "time": i[4]
            }
            items.append(item)
        return items


if __name__ == "__main__":
    spider = SpiderTools()
    i = 0
    while True:
        items = spider.load_onePage('https://maoyan.com/board/4?offset=%d' % i)
        if not items:
            break
        print(i, items)
        i = i + 10
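The regular expression is the heart of the parsing step, so it is worth sanity-checking on its own. The sketch below runs the same pattern against a small hand-written HTML fragment that only imitates the structure of one <dd> entry on the board page; the movie data and URLs in it are made up for illustration, not actual Maoyan markup.

import re

# Hypothetical fragment imitating one <dd> entry of the board page (not real Maoyan HTML)
sample_html = '''
<dd>
    <i class="board-index board-index-1">1</i>
    <p class="name"><a href="/films/1" title="Example Movie">Example Movie</a></p>
    <img data-src="https://example.com/poster.jpg" alt="Example Movie" class="board-img" />
    <p class="star">
        Starring: Actor A, Actor B
    </p>
    <p class="releasetime">Release date: 1994-09-10</p>
</dd>
'''

# Same pattern as in load_onePage, split across lines for readability
pattern = re.compile(
    '<dd>.*?board-index.*?>(.*?)</i>.*?<p class="name".*?><a.*?>(.*?)</a>'
    '.*?<img data-src="(.*?)".*?>.*?<p class="star">(.*?)</p>'
    '.*?<p class="releasetime">(.*?)</p>',
    re.S)

for order, name, image_url, star, release in re.findall(pattern, sample_html):
    print(order, name, image_url, star.strip(), release)

If the real page layout ever changes, this kind of offline test makes it obvious which capture group stopped matching.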
The program above can be modified to return an iterator via yield, as follows:
# Scrape the movie list from the Maoyan Top 100 board: https://maoyan.com/board/4
import requests
import re

# Spider utility class
class SpiderTools():
    def __init__(self):
        super(SpiderTools, self).__init__()

    # Fetch and parse one page of the board
    def load_onePage(self, url):
        self.headers = {
            'Host': 'maoyan.com',
            'Accept': 'text/html, application/xhtml+xml, */*',
            'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:64.0) Gecko/20100101 Firefox/64.0'
        }
        res = requests.get(url, headers=self.headers)
        # print(res.text)
        # Parse out rank, title, poster URL, starring actors and release date
        pattern = re.compile('<dd>.*?board-index.*?>(.*?)</i>.*?<p class="name".*?><a.*?>(.*?)</a>.*?<img data-src="(.*?)".*?>.*?<p class="star">(.*?)</p>.*?<p class="releasetime">(.*?)</p>', re.S)
        result = re.findall(pattern, res.text)
        # Pack each match into a dictionary
        for i in result:
            yield {  # yield turns this method into a generator (iterator)
                "order": i[0],
                "name": i[1],
                "imageURL": i[2],
                "auth": i[3].strip(),  # strip() removes surrounding newlines and spaces
                "time": i[4]
            }


if __name__ == "__main__":
    spider = SpiderTools()
    # The board has 10 pages, offsets 0, 10, ..., 90
    for i in range(10):
        items = spider.load_onePage('https://maoyan.com/board/4?offset=%d' % (i * 10))
        # The method now returns a generator (iterator), so it cannot be tested for
        # emptiness up front; just iterate over it
        for a in items:
            print(a)
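The comment in the main block points at a practical consequence of the yield version: a generator is consumed lazily and only once, so a plain emptiness check no longer works. If you do need to know whether a page returned anything (for example to stop paging early, as the list-based version did), you can materialize the generator with list() first. A minimal sketch, independent of the scraping code and using a made-up stand-in function:

# Stand-in for load_onePage: yields items one by one instead of returning a list
def load_onePage_demo(page_items):
    for item in page_items:
        yield item

gen = load_onePage_demo(["movie1", "movie2"])
items = list(gen)          # materialize the generator so it can be tested and reused
if not items:              # now an emptiness check works, as in the list-based version
    print("empty page")
else:
    for a in items:
        print(a)

# Iterating the same generator a second time yields nothing: it is already exhausted
print(list(gen))           # -> []

The trade-off is memory: list() pulls every item of the page into memory at once, whereas iterating the generator directly processes one movie at a time.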