Python3.8 爬取豆瓣电影TOP250 练手爬虫
#!/usr/bin/env python
# encoding=utf-8
"""Scrape the Douban Movie Top 250 list and save it to an Excel workbook."""
import codecs
import re
from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup
from openpyxl import Workbook

wb = Workbook()
dest_filename = '电影.xlsx'
ws1 = wb.active
ws1.title = "电影top250"

DOWNLOAD_URL = 'http://movie.douban.com/top250/'


def download_page(url, timeout=10):
    """Fetch *url* and return the raw response body as bytes.

    Args:
        url: Address of the page to download.
        timeout: Seconds to wait for the server before giving up, so a
            stalled connection cannot hang the crawl forever.

    Raises:
        requests.RequestException: On connection problems, a timeout, or an
            HTTP error status (4xx/5xx).
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/47.0.2526.80 Safari/537.36'
    }
    response = requests.get(url, headers=headers, timeout=timeout)
    # Fail loudly on an error status instead of trying to parse an error page.
    response.raise_for_status()
    return response.content


def get_li(doc):
    """Parse one listing page and extract the per-movie fields.

    Args:
        doc: HTML of a listing page, as returned by ``download_page``.

    Returns:
        A 6-tuple ``(name, star_con, score, info_list, desc_list, next_url)``.
        The first five are parallel lists (title, rating-count text, score,
        one-line quote, description) with one entry per movie; ``next_url``
        is the absolute URL of the following page, or ``None`` when the page
        has no "next" link.

    Raises:
        ValueError: If the page does not contain the ``<ol class="grid_view">``
            movie list.
    """
    soup = BeautifulSoup(doc, 'html.parser')
    ol = soup.find('ol', class_='grid_view')
    if ol is None:
        raise ValueError('movie list <ol class="grid_view"> not found in page')

    name = []  # movie titles
    star_con = []  # rating-count text
    score = []  # scores
    info_list = []  # one-line quotes
    desc_list = []  # descriptions
    rating_count_re = re.compile('评价')  # compiled once, reused per item

    for item in ol.find_all('li'):
        detail = item.find('div', attrs={'class': 'hd'})
        movie_name = detail.find('span', attrs={'class': 'title'}).get_text()
        level_star = item.find('span', attrs={'class': 'rating_num'}).get_text()
        star = item.find('div', attrs={'class': 'star'})
        star_num = star.find(string=rating_count_re)
        info = item.find('span', attrs={'class': 'inq'})
        desc = item.find('p', attrs={'class': ''})

        name.append(movie_name)
        # Store a plain str rather than the bs4 NavigableString node.
        star_con.append(str(star_num) if star_num is not None else None)
        score.append(level_star)
        # Not every movie has a quote; keep the lists aligned with a placeholder.
        info_list.append(info.get_text() if info is not None else '无')
        desc_list.append(desc.get_text() if desc is not None else '')

    # The pager span or its <a> may be absent; treat either as "no next page".
    next_span = soup.find('span', attrs={'class': 'next'})
    next_link = next_span.find('a') if next_span is not None else None
    href = next_link.get('href') if next_link is not None else None
    next_url = urljoin(DOWNLOAD_URL, href) if href else None
    return name, star_con, score, info_list, desc_list, next_url


def main():
    """Crawl every listing page and write the results to the workbook.

    Follows the "next" links starting from ``DOWNLOAD_URL``, collects all
    movies, writes one movie per row into columns A-E of ``ws1`` and saves
    the workbook to ``dest_filename``.
    """
    url = DOWNLOAD_URL
    name = []
    star_con = []
    score = []
    info = []
    desc = []
    while url:
        doc = download_page(url)
        movie, star, level_num, info_list, desc_list, url = get_li(doc)
        name.extend(movie)
        star_con.extend(star)
        score.extend(level_num)
        info.extend(info_list)
        desc.extend(desc_list)

    # Number rows by position, not by title lookup: duplicate titles would
    # otherwise map to the same row and overwrite each other.
    records = zip(name, star_con, score, info, desc)
    for row, (title, votes, rating, quote, summary) in enumerate(records, start=1):
        ws1[f'A{row}'] = title
        ws1[f'B{row}'] = votes
        ws1[f'C{row}'] = rating
        ws1[f'D{row}'] = quote
        ws1[f'E{row}'] = summary
    wb.save(filename=dest_filename)


if __name__ == '__main__':
    main()
用完Python写爬虫，再也不想用PHP写了，方便太多了。PHP只有无数的正则匹配，效率低，还写得累……
滴水成冰,世间不存在毫无意义的付出,时间终会给你答案。