1 from urllib import request
 2 from urllib import parse
 3 import time
 4 import re
 5 import pymysql
 6 
 7 class MaoyanSpider(object):
 8     def __init__(self):
 9         self.baseurl = 'https://maoyan.com/board/4?offset='
10         self.headers = {'User-Agent':'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.119 Safari/537.36'}
11         # 爬取页数计数
12         self.page = 1
13         # 创建2个对象
14         self.db = pymysql.connect(
15             'localhost','root','123456','spider',
16             charset='utf8'
17         )
18         self.cursor = self.db.cursor()
19 
20 
21     # 获取页面
22     def get_page(self,url):
23         req = request.Request(url,headers=self.headers)
24         res = request.urlopen(req)
25         html = res.read().decode('utf-8')
26         # 直接调用解析函数
27         self.parse_page(html)
28 
29     # 解析页面
30     def parse_page(self,html):
31         # 正则解析
32         p = re.compile('<div class="movie-item-info">.*?title="(.*?)".*?class="star">(.*?)</p>.*?releasetime">(.*?)</p>',re.S)
33         r_list = p.findall(html)
34         # r_list : [('霸王别姬','张国荣','1993'),(),()]
35         self.write_page(r_list)
36 
37     # 保存数据(存到mysql数据库)
38     def write_page(self,r_list):
39         ins = 'insert into film(name,star,time) \
40                values(%s,%s,%s)'
41         for rt in r_list:
42             film_list = [
43                 rt[0].strip(),
44                 rt[1].strip(),
45                 rt[2].strip()[5:15]
46              ]
47 
48             self.cursor.execute(ins,film_list)
49             # 提交到数据库执行
50             self.db.commit()
51 
52     # 主函数
53     def main(self):
54         # 用range函数可获取某些查询参数的值
55         for offset in range(0,41,10):
56             url = self.baseurl + str(offset)
57             self.get_page(url)
58             print('第%d页爬取成功' % self.page)
59             self.page += 1
60             time.sleep(1)
61         # 等所有页面爬完后再关闭
62         self.cursor.close()
63         self.db.close()
64 
if __name__ == '__main__':
    # Script entry point: build the spider (which opens the MySQL connection)
    # and run the full crawl.
    MaoyanSpider().main()