记一次 Python 爬虫实战:豆瓣电影 Top250 爬虫
1 import requests
2 from bs4 import BeautifulSoup
3 import re
4 import traceback
5
def GetHtmlText(url):
    """Download ``url`` and return the response body decoded as UTF-8.

    The request is attempted up to two times; each failure prints its
    traceback and the next attempt is made.

    Args:
        url: Address of the page to fetch.

    Returns:
        The response text, or ``None`` when every attempt failed.
    """
    max_attempts = 2  # the original loop ran once although it meant to try twice
    for _ in range(max_attempts):
        try:
            # A timeout keeps a stalled connection from blocking the crawl forever.
            r = requests.get(url, timeout=10)
            r.encoding = 'utf-8'
            r.raise_for_status()
            return r.text
        except requests.RequestException:
            # Covers connection errors, timeouts and HTTP error statuses,
            # without swallowing KeyboardInterrupt / SystemExit.
            traceback.print_exc()
    return None
17
def _parse_movie(single):
    """Extract one movie's fields from its ``<li>`` element into a new dict."""
    titles = single.find_all(name='span', class_='title')
    other_title = single.find(name='span', class_='other').text
    if len(titles) == 2:
        movieTitle = titles[0].text + titles[1].text + other_title
    else:
        movieTitle = titles[0].text + other_title

    # Keep only the element children of the 'bd' div. ``.contents`` also
    # returns the text nodes between the tags (presumably whitespace), which
    # is why indexing it positionally needed the odd indices 1, 3 and 5.
    sections = single.find(name='div', class_='bd').find_all(True, recursive=False)

    # \d+ on both sides so a two-digit score such as "10.0" is not truncated.
    rating = re.search(r'\d+\.\d+', str(sections[1]))

    return {
        'num': single.find(name='em').text,  # ranking number
        'movieTitle': movieTitle,
        'actor': sections[0].text,
        'rating': rating.group() if rating else '',
        # Some entries have no third section (no quote); fall back to empty.
        'quote': sections[2].text if len(sections) > 2 else '',
    }


def GetMovieInfo(url):
    """Crawl ten result pages (25 entries each) and record every movie found.

    Each page is fetched from ``url`` with a ``?start=<offset>`` query, parsed
    with BeautifulSoup, and every ``<li>`` inside ``<ol class="grid_view">`` is
    turned into a dict and handed to ``printMovieInfo``. Failures are reported
    with a traceback and skipped, so one bad page or one bad entry does not
    stop the rest of the crawl.

    Args:
        url: Base URL of the list, without the ``?start=`` query.
    """
    for page in range(10):
        html = GetHtmlText(url + '?start=' + str(page * 25))
        if html is None:
            # Fetch failed; GetHtmlText has already printed the traceback.
            continue

        try:
            soup = BeautifulSoup(html, 'html.parser')
            movieList = soup.find(name='ol', class_='grid_view').find_all(name='li')
        except Exception:
            # Page without the expected list (e.g. an error page): skip it.
            traceback.print_exc()
            continue

        for single in movieList:
            try:
                # A fresh dict per movie; nothing is shared between entries.
                printMovieInfo(_parse_movie(single))
            except Exception:
                # Best effort: report the malformed entry and keep going.
                traceback.print_exc()
47
48
def printMovieInfo(Info, path='/home/why/py/movieInfo.txt'):
    """Append one movie record to a UTF-8 text file.

    The record is written as the ranking and title on one line, the
    cast/crew text, then a rating line and a quote line.

    Args:
        Info: Mapping with the string values ``'num'``, ``'movieTitle'``,
            ``'actor'``, ``'rating'`` and ``'quote'``.
        path: File to append to. Defaults to the location the script has
            always used, so existing callers are unaffected.

    Any error (missing key, non-string value, unwritable file) is reported
    with a traceback and otherwise ignored, so one bad record does not stop
    the caller.
    """
    try:
        # Build the record first so a malformed Info never touches the file.
        record = (
            Info['num'] + Info['movieTitle'] + '\n'
            + Info['actor']
            + '\n评分:' + Info['rating']
            + '\n评价:' + Info['quote'] + '\n'
        )
        with open(path, 'a', encoding='utf-8') as f:
            f.write(record)
    except Exception:
        traceback.print_exc()
55
56
def main():
    """Crawl the Douban movie Top 250 list starting from its base URL."""
    base_url = 'https://movie.douban.com/top250'
    GetMovieInfo(base_url)


# Run the crawl only when executed as a script, so that importing this
# module does not trigger network requests and file writes.
if __name__ == '__main__':
    main()
结果: