爬取豆瓣电影信息保存到Excel
from bs4 import BeautifulSoup
import requests
import html.parser
from openpyxl import Workbook,load_workbook
import os


class DouBan(object):
    """Scrape movie details linked from a Douban page into an Excel workbook.

    For every anchor matched by ``li.poster > a`` on the start page, the
    linked page is downloaded and three text fields (title, info block,
    description) are written to one spreadsheet row.
    """

    def __init__(self, url='https://movie.douban.com/',
                 filename='move_msg.xlsx', timeout=10):
        """Configure the scraper.

        Args:
            url: Start page whose poster links are followed.
            filename: Path of the ``.xlsx`` file to write; it is created
                when it does not exist yet.
            timeout: Seconds to wait for each HTTP request before giving up.
        """
        self.url = url
        self.filename = filename
        self.timeout = timeout
        # Browser-like User-Agent sent with every request.
        self.header = {'User-Agent':'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.99 Safari/537.36'}

    def openUrl(self, url):
        """Issue a GET request for *url* and return the ``requests`` response.

        The response status is not checked here; callers receive the
        response object as-is.
        """
        # Without a timeout a stalled connection would block forever.
        response = requests.get(url, headers=self.header,
                                timeout=self.timeout)
        return response

    def getUrl(self):
        """Return the ``<a>`` tags matched by ``li.poster > a`` on the start page."""
        response = self.openUrl(self.url)
        soup = BeautifulSoup(response.text, 'html.parser')
        return soup.select("li.poster > a")

    @staticmethod
    def _firstText(node, selector):
        """Return the text of the first *selector* match under *node*, or ''."""
        matches = node.select(selector)
        return matches[0].text if matches else ''

    def getMsg(self):
        """Download every linked movie page and store its fields in the workbook.

        The movie at position ``num`` in the poster list is written to row
        ``num + 1``; columns 1-3 hold title, info and description. A field
        that cannot be found on the page is stored as an empty string.
        Entries without an ``href`` or without a ``div#content`` container
        are skipped, leaving their row untouched.
        """
        for num, href in enumerate(self.getUrl()):
            link = href.get('href')
            if not link:
                continue
            print(link)

            response = self.openUrl(link)
            soup = BeautifulSoup(response.text, 'html.parser')
            containers = soup.select('div#content')
            if not containers:
                # Not the expected page layout (e.g. an error page).
                continue
            content = containers[0]

            title = self._firstText(content, 'h1').replace('\n', '')
            info = self._firstText(content, '#info')
            describe = self._firstText(
                content, 'div#link-report span').replace(' ', '')

            self._saveRow(num + 1, [title, info, describe])

    def _loadWorkbook(self):
        """Open the output workbook, creating an empty one if it is missing."""
        if os.path.exists(self.filename):
            return load_workbook(self.filename)
        return Workbook()

    def _saveRow(self, row_, values):
        """Write *values* into columns 1..n of row *row_* with a single save."""
        wb = self._loadWorkbook()
        sheet = wb.active
        for column_, value in enumerate(values, 1):
            sheet.cell(row=row_, column=column_).value = value
        wb.save(self.filename)

    def saveMsg(self, row_, column_, msg):
        """Store *msg* in the cell at (*row_*, *column_*) and save the workbook.

        Row and column numbers are 1-based. The workbook file is created
        on first use.
        """
        wb = self._loadWorkbook()
        sheet = wb.active
        sheet.cell(row=row_, column=column_).value = msg
        wb.save(self.filename)


if __name__ == "__main__":
    db = DouBan()
    db.getMsg()