from urllib import request
import time
import re
import pymysql


class MaoyanSpider(object):
    def __init__(self):
        self.baseurl = 'https://maoyan.com/board/4?offset='
        self.headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.119 Safari/537.36'}
        # counter for the number of pages scraped
        self.page = 1
        # create the database connection and the cursor
        self.db = pymysql.connect(
            host='localhost', user='root', password='123456',
            database='spider', charset='utf8'
        )
        self.cursor = self.db.cursor()

    # fetch one page of HTML
    def get_page(self, url):
        req = request.Request(url, headers=self.headers)
        res = request.urlopen(req)
        html = res.read().decode('utf-8')
        # hand the HTML straight to the parser
        self.parse_page(html)

    # parse one page
    def parse_page(self, html):
        # extract title, stars and release time with a regular expression
        p = re.compile('<div class="movie-item-info">.*?title="(.*?)".*?class="star">(.*?)</p>.*?releasetime">(.*?)</p>', re.S)
        r_list = p.findall(html)
        # r_list: [('霸王别姬', '张国荣', '1993'), (), ()]
        self.write_page(r_list)

    # save the data to the MySQL database
    def write_page(self, r_list):
        ins = 'insert into film(name,star,time) \
               values(%s,%s,%s)'
        for rt in r_list:
            film_list = [
                rt[0].strip(),
                rt[1].strip(),
                # drop the "上映时间：" prefix and keep the 10-character date
                rt[2].strip()[5:15]
            ]
            self.cursor.execute(ins, film_list)
            # commit the insert to the database
            self.db.commit()

    # main entry point
    def main(self):
        # range() generates the values of the offset query parameter
        for offset in range(0, 41, 10):
            url = self.baseurl + str(offset)
            self.get_page(url)
            print('Page %d scraped successfully' % self.page)
            self.page += 1
            time.sleep(1)
        # close the cursor and connection only after all pages are done
        self.cursor.close()
        self.db.close()


if __name__ == '__main__':
    spider = MaoyanSpider()
    spider.main()
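The spider assumes that a MySQL database named spider already exists and that it contains a film table with name, star and time columns; the script itself does not create either. Below is a minimal setup sketch using pymysql. The column types are illustrative assumptions, since the original post does not show the schema; adjust them to your own data.

import pymysql

# Assumed one-off setup for the `spider` database and `film` table.
# Column types are guesses for illustration only.
conn = pymysql.connect(host='localhost', user='root',
                       password='123456', charset='utf8')
cur = conn.cursor()
cur.execute('CREATE DATABASE IF NOT EXISTS spider DEFAULT CHARACTER SET utf8')
cur.execute('''
    CREATE TABLE IF NOT EXISTS spider.film (
        id   INT PRIMARY KEY AUTO_INCREMENT,
        name VARCHAR(100),   -- movie title
        star VARCHAR(300),   -- leading actors
        time VARCHAR(30)     -- release date, e.g. 1993-01-01
    ) DEFAULT CHARACTER SET utf8
''')
conn.commit()
cur.close()
conn.close()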
I don't so much enjoy writing code as I enjoy tinkering with it. C# or Python both work for me. Feel free to leave a comment and discuss!