爬取豆瓣电影信息保存到Excel

 1 from bs4 import BeautifulSoup
 2 import requests
 3 import html.parser
 4 from openpyxl import Workbook,load_workbook
 5 import os
 6 class DouBan(object):
 7 
 8     def __init__(self):
 9         self.url = 'https://movie.douban.com/'
10         self.header = {'User-Agent':'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.99 Safari/537.36'}
11 
12     def openUrl(self, url):
13         response = requests.get(url,headers=self.header)
14         return response
15 
16     def getUrl(self):
17         response = self.openUrl(self.url)
18         douban_html = response.text
19         # print(douban_html)
20         soup = BeautifulSoup(douban_html,'html.parser')
21         hrefs = soup.select("li.poster > a")
22         return hrefs
23         # for href in hrefs:
24         #     print(href['href']
25     def getMsg(self):
26         hrefs = self.getUrl()
27         for num,href in enumerate(hrefs):
28             msg_list = []
29             print(href['href'])
30             response = self.openUrl(href['href'])
31             html_mover = response.text
32             soup = BeautifulSoup(html_mover,'html.parser')
33             all_info = soup.select('div#content')
34             # print(all_info)
35             title = all_info[0].select('h1')[0].text.replace('\n','')
36             msg_list.append(title)
37             # print(title)
38             info = all_info[0].select('#info')[0].text
39             msg_list.append(info)
40             # print(info)
41             describe = all_info[0].select('div#link-report span')[0].text.replace(' ','')
42             msg_list.append(describe)
43             # print(describe)
44             # return title,info,describe
45             for col in range(3):
46                 self.saveMsg(num+1, col+1,  msg_list[col])
47 
48     def saveMsg(self, row_, column_,msg):
49         # msg = self.getMsg()
50         # a = os.path.exists('//move_msg.xlsx')
51         # if a=False:
52         #     os.mkdir('move_msg.xlsx')
53         
54         wb = load_workbook('move_msg.xlsx')
55         sheet = wb.active
56         sheet.cell(row=row_, column=column_).value = msg
57         wb.save('move_msg.xlsx')
58 
59 
60 
61 
if __name__ == "__main__":
    # Script entry point: build the crawler and run the full scrape.
    DouBan().getMsg()

 

posted @ 2017-09-04 17:34  RoyFans  阅读(1123)  评论(0)  编辑  收藏  举报