爬取猫眼电影榜单TOP100榜-以mysql数据库保存
在数据库中建库建表
# 连接到mysql数据库 mysql -h127.0.0.1 -uroot -p123456 # 建库建表 create database maoyandb charset utf8; use maoyandb; create table filmtab( name varchar(100), star varchar(300), time varchar(50) )charset=utf8;
回顾pymysql基本使用
import pymysql # 创建2个对象 db = pymysql.connect('localhost','root','123456','maoyandb',charset='utf8') cursor = db.cursor() # 执行SQL命令并提交到数据库执行 # execute()方法第二个参数为列表传参补位 ins = 'insert into filmtab values(%s,%s,%s)' cursor.execute(ins,['霸王别姬','张国荣','1993']) db.commit() # 关闭 cursor.close() db.close()
来试试高效的executemany()方法?
import pymysql # 创建2个对象 db = pymysql.connect('192.168.153.137','tiger','123456','maoyandb',charset='utf8') cursor = db.cursor() # 抓取的数据 film_list = [('月光宝盒','周星驰','1994'),('大圣娶亲','周星驰','1994')] # 执行SQL命令并提交到数据库执行 # execute()方法第二个参数为列表传参补位 cursor.executemany('insert into filmtab values(%s,%s,%s)',film_list) db.commit() # 关闭 cursor.close() db.close()
将电影信息存入MySQL数据库(尽量使用executemany方法)
from urllib import request import re import time import random from useragents import ua_list import pymysql class MaoyanSpider(object): def __init__(self): self.url = 'https://maoyan.com/board/4?offset={}' # 计数 self.num = 0 # 创建2个对象 self.db = pymysql.connect( 'localhost','root','123456','maoyandb',charset='utf8' ) self.cursor = self.db.cursor() def get_html(self,url): headers = { 'User-Agent' : random.choice(ua_list) } req = request.Request(url=url,headers=headers) res = request.urlopen(req) html = res.read().decode('utf-8') # 直接调用解析函数 self.parse_html(html) def parse_html(self,html): re_bds = r'<div class="movie-item-info">.*?title="(.*?)".*?class="star">(.*?)</p>.*?releasetime">(.*?)</p>' pattern = re.compile(re_bds,re.S) # film_list: [('霸王别姬','张国荣','1993'),()] film_list = pattern.findall(html) # 直接调用写入函数 self.write_html(film_list) # # mysql - execute() # def write_html(self,film_list): # ins = 'insert into filmtab values(%s,%s,%s)' # for film in film_list: # L = [ # film[0].strip(), # film[1].strip(), # film[2].strip()[5:15] # ] # self.cursor.execute(ins,L) # # 千万别忘了提交到数据库执行 # self.db.commit() # mysql - executemany([(),(),()]) def write_html(self, film_list): L = [] ins = 'insert into filmtab values(%s,%s,%s)' for film in film_list: t = ( film[0].strip(), film[1].strip(), film[2].strip()[5:15] ) L.append(t) self.cursor.executemany(ins, L) # 千万别忘了提交到数据库执行 self.db.commit() def main(self): for offset in range(0,31,10): url = self.url.format(offset) self.get_html(url) time.sleep(random.randint(1,2)) # 断开数据库连接 self.cursor.close() self.db.close() if __name__ == '__main__': start = time.time() spider = MaoyanSpider() spider.main() end = time.time() print('执行时间:%.2f' % (end-start))
做个SQL查询
1、查询20年以前的电影的名字和上映时间 select name,time from filmtab where time<(now()-interval 20 year); 2、查询1990-2000年的电影名字和上映时间 select name,time from filmtab where time>='1990-01-01' and time<='2000-12-31';