# Douban Top-250 crawler.
#
# NOTE: a plain requests.get() against movie.douban.com returns HTTP 418
# because the site rejects requests that lack a browser User-Agent header.
# Sending a real browser User-Agent (visible via chrome://version) makes the
# request succeed with HTTP 200, so every request below carries one.
from bs4 import BeautifulSoup
import requests
import openpyxl

# Built by explicit concatenation: the original used a backslash continuation
# *inside* the string literal, which embedded a run of indentation spaces in
# the header value actually sent to the server.
HEADERS = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
                  'AppleWebKit/537.36 (KHTML, like Gecko) '
                  'Chrome/81.0.4044.129 Safari/537.36'
}


def HTMLDownloader(page):
    """Download one page (25 movies) of the douban Top-250 list.

    page: 0-based page index; the list offset is page * 25.
    Returns the page's HTML text on HTTP 200, otherwise None
    (non-200 status or any requests-level failure).
    """
    url = 'https://movie.douban.com/top250?start=' + str(page * 25) + '&filter='
    try:
        # timeout keeps a stalled connection from hanging the crawler forever.
        reponse = requests.get(url, headers=HEADERS, timeout=10)
        if reponse.status_code == 200:  # request succeeded
            return reponse.text
        return None
    except requests.RequestException:
        return None


def HTMLParser(reponse):
    """Parse one Top-250 page into a list of movie records.

    reponse: HTML text of a page, or None (yields an empty list).
    Returns a list whose elements are [title, rating, info, comment],
    one per movie found on the page.
    """
    outcome = []
    if reponse is None:
        return outcome
    soup = BeautifulSoup(reponse, 'lxml')  # lxml backend, as in the original
    # Each movie sits in a <div class="item">.
    for item in soup.findAll('div', attrs={'class': 'item'}):
        title = item.find('span', attrs={'class': 'title'}).string
        rank = item.find('span', attrs={'class': 'rating_num'}).string
        movinfo = item.find('p', attrs={'class': ''}).text
        # Some movies carry no one-line quote ("inq"); default to ''.
        inq = item.find('span', attrs={'class': 'inq'})
        comment = inq.string if inq else ''
        print('crawling', title, rank, movinfo, comment)
        outcome.append([title, rank, movinfo.strip(), comment.strip()])
    return outcome


def MovieInfoSaver(mov):
    """Save crawled movie records to douban_hrank_movie.xlsx.

    mov: list of [title, rating, info, comment] records; one spreadsheet
    row is written per record, below a Chinese-labelled header row.
    """
    workbook = openpyxl.Workbook()
    sheet = workbook.active
    sheet.title = 'douban_movie'
    # Header row (user-facing Chinese column labels).
    sheet.cell(1, 1, '电影名')
    sheet.cell(1, 2, '豆瓣评分')
    sheet.cell(1, 3, '简介')
    sheet.cell(1, 4, '评论')
    for idx, m in enumerate(mov):
        sheet.cell(idx + 2, 1, m[0])
        sheet.cell(idx + 2, 2, m[1])
        sheet.cell(idx + 2, 3, m[2])
        sheet.cell(idx + 2, 4, m[3])
    workbook.save(u'douban_hrank_movie.xlsx')  # write the workbook to disk


if __name__ == "__main__":
    movinfo = []
    # Crawl the 10 pages (25 movies each) that make up the Top 250.
    for page in range(10):
        reponse = HTMLDownloader(page)
        movinfo += HTMLParser(reponse)
    MovieInfoSaver(movinfo)
#+=#+=#+=#+=#+=#+=#+=#+=#+=#+=#+=#+=#+=#+=#+=#+=#+=#+=#+=#+=#+=#+=#+=#+=#+=#+=#+=
本文来自博客园,作者:凡是过去,皆为序曲,转载请注明原文链接:https://www.cnblogs.com/longhai3/p/15887907.html
如有疑问,欢迎提问
#+=#+=#+=#+=#+=#+=#+=#+=#+=#+=#+=#+=#+=#+=#+=#+=#+=#+=#+=#+=#+=#+=#+=#+=#+=#+=#+=