Data Persistence
I. Data Persistence - CSV Files
1. Purpose
Save the scraped data to a local CSV file.
2. Usage Workflow
1. Import the module
2. Open the CSV file
3. Initialize the writer object
4. Write the data (the argument is a list)

import csv

with open('film.csv', 'w') as f:
    writer = csv.writer(f)
    writer.writerow([])
    writer.writerows([(), (), ()])
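If you want named columns instead of plain lists, the same four-step workflow also works with csv.DictWriter. A minimal sketch; the file name film_dict.csv and the column names name/star/time are made up here for illustration:

import csv

# Hypothetical file and column names, used only to illustrate DictWriter
with open('film_dict.csv', 'w', newline='') as f:
    writer = csv.DictWriter(f, fieldnames=['name', 'star', 'time'])
    writer.writeheader()                                   # write the header row
    writer.writerow({'name': '霸王别姬', 'star': '张国荣', 'time': '1993'})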
3. Example Code
Create a test.csv file and write data into it.
# Single-row writes (writerow([]))
import csv

with open('test.csv', 'w', newline='') as f:
    writer = csv.writer(f)
    writer.writerow(['步惊云', '36'])
    writer.writerow(['聂风', '36'])

# Multi-row write (writerows([(),(),()]))
import csv

with open('test.csv', 'w', newline='') as f:
    writer = csv.writer(f)
    writer.writerows([('聂风', '36'), ('秦霜', '25'), ('孔慈', '30')])
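To confirm what was written, the file can be read back with csv.reader. A small sketch, assuming test.csv was produced by the code above:

import csv

# Read test.csv back and print each row as a list of strings
with open('test.csv', 'r', newline='') as f:
    reader = csv.reader(f)
    for row in reader:
        print(row)        # e.g. ['聂风', '36']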
Exercise: save the Maoyan movie data to a local maoyanfilm.csv file, using the writerow() method.
Think about it: how would you implement the same thing with writerows()?
import csv
from urllib import request, parse
import re
import time
import random
from useragents import ua_list

class MaoyanSpider(object):
    def __init__(self):
        self.url = 'https://maoyan.com/board/4?offset={}'
        # Counter
        self.num = 0

    def get_html(self, url):
        headers = {
            'User-Agent': random.choice(ua_list)
        }
        req = request.Request(url=url, headers=headers)
        res = request.urlopen(req)
        html = res.read().decode('utf-8')
        # Call the parsing function directly
        self.parse_html(html)

    def parse_html(self, html):
        # Compile the regular expression
        re_ = '<div class="movie-item-info">.*?title="(.*?)".*?<p class="star">(.*?)</p>.*?<p class="releasetime">(.*?)</p> '
        pattern = re.compile(re_, re.S)
        # film_list: [('霸王别姬', '张国荣', '1993')]
        film_list = pattern.findall(html)
        self.write_html(film_list)

    # Save to the CSV file - writerows
    def write_html(self, film_list):
        L = []
        with open('maoyanfilm.csv', 'a', newline='') as f:
            # Initialize the writer object; do not forget the argument f
            writer = csv.writer(f)
            for film in film_list:
                t = (
                    film[0].strip(),
                    film[1].strip(),
                    film[2].strip()[5:15]
                )
                self.num += 1
                L.append(t)
            # writerows() takes a list of rows
            writer.writerows(L)
            print(L)

    # def write_html(self, film_list):
    #     with open('maoyanfilm.csv', 'a') as f:
    #         # Initialize the writer object; do not forget the argument f
    #         writer = csv.writer(f)
    #         for film in film_list:
    #             L = [
    #                 film[0].strip(),
    #                 film[1].strip(),
    #                 film[2].strip()[5:15]
    #             ]
    #             self.num += 1
    #             # writerow() takes one row as a list
    #             writer.writerow(L)

    def main(self):
        for offset in range(0, 91, 10):
            url = self.url.format(offset)
            self.get_html(url)
            time.sleep(random.randint(1, 2))
        print('共抓取数据', self.num, "部")

if __name__ == '__main__':
    start = time.time()
    spider = MaoyanSpider()
    spider.main()
    end = time.time()
    print('执行时间:%.2f' % (end - start))
# useragents.py - the User-Agent pool imported by the spiders above
ua_list = [
    'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0)',
    'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:6.0) Gecko/20100101 Firefox/6.0',
    'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/535.1 (KHTML, like Gecko) Chrome/13.0.782.41 Safari/535.1 QQBrowser/6.9.11079.201',
    'Opera/9.80 (Windows NT 6.1; U; zh-cn) Presto/2.9.168 Version/11.50',
]
II. Data Persistence - MySQL Database

# Connect to the MySQL database
mysql -h127.0.0.1 -uroot -p123456

# Create the database and the table
create database maoyandb charset utf8;
use maoyandb;
create table filmtab(
    name varchar(100),
    star varchar(300),
    time varchar(50)
)charset=utf8;
import pymysql

# Create two objects: the connection and the cursor
# Note: newer PyMySQL releases may require keyword arguments,
# e.g. pymysql.connect(host=..., user=..., password=..., database=...)
db = pymysql.connect('localhost', 'root', '123456', 'maoyandb', charset='utf8')
cursor = db.cursor()

# Execute the SQL statement and commit it to the database
# The second argument of execute() is a list that fills the %s placeholders
cursor.execute('insert into filmtab values(%s,%s,%s)', ['霸王别姬', '张国荣', '1993'])
db.commit()

# Close
cursor.close()
db.close()
import pymysql

# Create two objects: the connection and the cursor
db = pymysql.connect('192.168.153.137', 'tiger', '123456', 'maoyandb', charset='utf8')
cursor = db.cursor()

# The scraped data
film_list = [('月光宝盒', '周星驰', '1994'), ('大圣娶亲', '周星驰', '1994')]

# Execute the SQL statement and commit it to the database
# The second argument of executemany() is a list of tuples that fills the placeholders
cursor.executemany('insert into filmtab values(%s,%s,%s)', film_list)
db.commit()

# Close
cursor.close()
db.close()
import pymysql

# __init__(self):
    self.db = pymysql.connect('IP', ... ...)
    self.cursor = self.db.cursor()

# write_data(self):
    self.cursor.execute('sql', [data1])
    self.cursor.executemany('sql', [(data1), (data2), (data3)])
    self.db.commit()

# main(self):
    self.cursor.close()
    self.db.close()
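Filled in, the template looks roughly like the sketch below. MysqlWriter is a made-up helper name; the connection parameters and the filmtab table are the ones used earlier in this section, and the table must already exist:

import pymysql

# Hypothetical helper class built from the template above
class MysqlWriter(object):
    def __init__(self):
        self.db = pymysql.connect(
            host='localhost', user='root', password='123456',
            database='maoyandb', charset='utf8'
        )
        self.cursor = self.db.cursor()

    def write_data(self, film_list):
        # film_list: [(name, star, time), ...]
        ins = 'insert into filmtab values(%s,%s,%s)'
        self.cursor.executemany(ins, film_list)
        self.db.commit()

    def close(self):
        self.cursor.close()
        self.db.close()

writer = MysqlWriter()
writer.write_data([('月光宝盒', '周星驰', '1994')])
writer.close()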
import pymysql
from urllib import request, parse
import re
import time
import random
from useragents import ua_list

class MaoyanSpider(object):
    def __init__(self):
        self.url = 'https://maoyan.com/board/4?offset={}'
        # Counter
        self.num = 0
        self.db = pymysql.connect(
            'localhost', 'root', '123456', 'maoyandb', charset='utf8'
        )
        self.cursor = self.db.cursor()

    def get_html(self, url):
        headers = {
            'User-Agent': random.choice(ua_list)
        }
        req = request.Request(url=url, headers=headers)
        res = request.urlopen(req)
        html = res.read().decode('utf-8')
        # Call the parsing function directly
        self.parse_html(html)

    def parse_html(self, html):
        # Compile the regular expression
        re_ = '<div class="movie-item-info">.*?title="(.*?)".*?<p class="star">(.*?)</p>.*?<p class="releasetime">(.*?)</p> '
        pattern = re.compile(re_, re.S)
        # film_list: [('霸王别姬', '张国荣', '1993')]
        film_list = pattern.findall(html)
        self.write_html(film_list)

    # MySQL - execute(): one insert and commit per record
    def write_html(self, film_list):
        ins = 'insert into filmtab values(%s,%s,%s)'
        for film in film_list:
            L = [
                film[0].strip(),
                film[1].strip(),
                film[2].strip()[5:15]
            ]
            self.num += 1
            print('爬取成功', self.num, '部')
            self.cursor.execute(ins, L)
            # Commit to the database
            self.db.commit()

    def main(self):
        for offset in range(0, 91, 10):
            url = self.url.format(offset)
            self.get_html(url)
            time.sleep(random.randint(1, 2))
        print('共抓取数据', self.num, "部")
        # Disconnect from the database
        self.cursor.close()
        self.db.close()

if __name__ == '__main__':
    start = time.time()
    spider = MaoyanSpider()
    spider.main()
    end = time.time()
    print('执行时间:%.2f' % (end - start))
import pymysql
from urllib import request, parse
import re
import time
import random
from useragents import ua_list

class MaoyanSpider(object):
    def __init__(self):
        self.url = 'https://maoyan.com/board/4?offset={}'
        # Counter
        self.num = 0
        self.db = pymysql.connect(
            'localhost', 'root', '123456', 'maoyandb', charset='utf8'
        )
        self.cursor = self.db.cursor()

    def get_html(self, url):
        headers = {
            'User-Agent': random.choice(ua_list)
        }
        req = request.Request(url=url, headers=headers)
        res = request.urlopen(req)
        html = res.read().decode('utf-8')
        # Call the parsing function directly
        self.parse_html(html)

    def parse_html(self, html):
        # Compile the regular expression
        re_ = '<div class="movie-item-info">.*?title="(.*?)".*?<p class="star">(.*?)</p>.*?<p class="releasetime">(.*?)</p> '
        pattern = re.compile(re_, re.S)
        # film_list: [('霸王别姬', '张国荣', '1993')]
        film_list = pattern.findall(html)
        self.write_html(film_list)

    # MySQL - executemany(): batch insert, one commit per page
    def write_html(self, film_list):
        L = []
        ins = 'insert into filmtab values(%s,%s,%s)'
        for film in film_list:
            t = [
                film[0].strip(),
                film[1].strip(),
                film[2].strip()[5:15]
            ]
            self.num += 1
            L.append(t)
            print('爬取成功', self.num, '部')
        self.cursor.executemany(ins, L)
        # Commit to the database
        self.db.commit()

    def main(self):
        for offset in range(0, 91, 10):
            url = self.url.format(offset)
            self.get_html(url)
            time.sleep(random.randint(1, 2))
        print('共抓取数据', self.num, "部")
        # Disconnect from the database
        self.cursor.close()
        self.db.close()

if __name__ == '__main__':
    start = time.time()
    spider = MaoyanSpider()
    spider.main()
    end = time.time()
    print('执行时间:%.2f' % (end - start))
1. Query the names and release dates of movies released 20 or more years ago
select name,time from filmtab where time<=(now()-interval 20 year);
2. Query the names and release dates of movies released between 1990 and 2000
select name,time from filmtab where time>='1990-01-01' and time<='2000-12-31';
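These queries can also be issued from Python through a pymysql cursor. A minimal sketch using the connection parameters from this section:

import pymysql

db = pymysql.connect(host='localhost', user='root', password='123456',
                     database='maoyandb', charset='utf8')
cursor = db.cursor()

# Movies released between 1990 and 2000, with the range passed as parameters
cursor.execute(
    'select name,time from filmtab where time>=%s and time<=%s',
    ['1990-01-01', '2000-12-31']
)
for name, release_time in cursor.fetchall():
    print(name, release_time)

cursor.close()
db.close()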
III. Data Persistence - MongoDB Database
(1) Features
1. Non-relational (NoSQL) database
2. Database / collection (the counterpart of a MySQL table) / document (the counterpart of a row in a MySQL table)
3. No need to create databases or collections manually
4. Basic Commands
# List all databases
>show dbs
# Switch to a database
>use db_name
# List all collections in the current database
>show collections
# Show all documents in a collection
>db.collection_name.find().pretty()
# Count the documents in a collection
>db.collection_name.count()
(2) Operating the MongoDB database with pymongo
import pymongo

# 1. Connection object
conn = pymongo.MongoClient(
    host='127.0.0.1',
    port=27017
)
# 2. Database object
db = conn['maoyandb']    # first way
# db = conn.maoyandb     # second way
# 3. Collection object
myset = db['filmtab']
# 4. Insert data
# Insert a single document
myset.insert_one({'name': '赵敏'})
# Insert multiple documents
myset.insert_many([{'name': '...'}, {'name': '...'}])
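After inserting, the data can be read back to verify it arrived. A minimal sketch using find() and count_documents(); count_documents is the pymongo 3.7+ counterpart of the shell's count():

import pymongo

conn = pymongo.MongoClient(host='127.0.0.1', port=27017)
myset = conn['maoyandb']['filmtab']

# Iterate over every document in the collection
for doc in myset.find():
    print(doc)

# Count the documents; count_documents() requires a filter, {} matches everything
print(myset.count_documents({}))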
import pymongo

# __init__(self):
    self.conn = pymongo.MongoClient('IP', 27017)
    self.db = self.conn['db_name']
    self.myset = self.db['set_name']

# write_data(self):
    self.myset.insert_one(dict)

# MongoDB - Command
>show dbs
>use db_name
>show collections
>db.collection_name.find().pretty()
>db.collection_name.count()
>db.collection_name.drop()
>db.dropDatabase()
Exercise:
import pymongo
from urllib import request
import re
import time
import random
from useragents import ua_list

class MaoyanSpider(object):
    def __init__(self):
        self.url = 'https://maoyan.com/board/4?offset={}'
        # Counter
        self.num = 0
        # Create the three objects: connection, database, collection
        self.conn = pymongo.MongoClient(host='127.0.0.1', port=27017)
        self.db = self.conn['maoyandb']
        self.myset = self.db['filmset']

    def get_html(self, url):
        headers = {
            'User-Agent': random.choice(ua_list)
        }
        req = request.Request(url=url, headers=headers)
        res = request.urlopen(req)
        html = res.read().decode('utf-8')
        # Call the parsing function directly
        self.parse_html(html)

    def parse_html(self, html):
        # Compile the regular expression
        re_ = '<div class="movie-item-info">.*?title="(.*?)".*?<p class="star">(.*?)</p>.*?<p class="releasetime">(.*?)</p> '
        pattern = re.compile(re_, re.S)
        # film_list: [('霸王别姬', '张国荣', '1993')]
        film_list = pattern.findall(html)
        self.write_html(film_list)

    # MongoDB - insert one document per movie
    def write_html(self, film_list):
        for film in film_list:
            film_dict = {
                'name': film[0].strip(),
                'star': film[1].strip(),
                'time': film[2].strip()[5:15]
            }
            self.num += 1
            # Insert into the MongoDB collection
            self.myset.insert_one(film_dict)
            print('爬取成功', self.num, '部')

    def main(self):
        for offset in range(0, 91, 10):
            url = self.url.format(offset)
            self.get_html(url)
            time.sleep(random.randint(1, 2))
        print('共抓取数据', self.num, "部")

if __name__ == '__main__':
    start = time.time()
    spider = MaoyanSpider()
    spider.main()
    end = time.time()
    print('执行时间:%.2f' % (end - start))
IV. Two-Level Page Scraping - 电影天堂 (dytt8.net)

# Overall approach
1. Scrape the level-1 page: extract the required data plus the links, then follow the links
2. Scrape the level-2 page: extract the required data plus the links, then follow the links
3. ... ...

# Code-design notes
1. All of the data is ultimately collected in the level-1 page parsing function
2. Avoid duplicated code - define separate functions for requesting and for parsing (see the sketch below)
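The follow-the-link pattern can be sketched as follows; the regexes here are placeholders, not the real dytt8.net ones, and the function names are made up for illustration:

from urllib import request
import re

def get_html(url):
    # One shared request function, reused for both page levels
    res = request.urlopen(request.Request(url))
    return res.read().decode('utf-8', 'ignore')

def re_func(re_bds, html):
    # One shared parsing helper
    return re.compile(re_bds, re.S).findall(html)

def parse_two_page(link):
    # Level-2 page: extract the single piece of data we need
    html = get_html(link)
    return re_func(r'placeholder-level-2-regex', html)[0]

def parse_one_page(url):
    # Level-1 page: extract data + links, then follow each link
    html = get_html(url)
    for link, name in re_func(r'placeholder-level-1-regex', html):
        item = {'name': name, 'download': parse_two_page(link)}
        print(item)   # all data ends up here, in the level-1 parser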
# Address
电影天堂 - 2019年新片精品 - 更多 (dytt8.net: "2019 new releases" list, "more" page)
# Target
Movie name, download link
# Analysis
********* To extract from the level-1 page *********
1. Movie name
2. Movie detail-page link
********* To extract from the level-2 page *********
1. Download link
Implementation steps:
2. Find the URL pattern
Page 1: https://www.dytt8.net/html/gndy/dyzz/list_23_1.html
Page 2: https://www.dytt8.net/html/gndy/dyzz/list_23_2.html
Page n: https://www.dytt8.net/html/gndy/dyzz/list_23_n.html
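The page URLs can then be generated from this pattern with str.format, exactly as the spider's main() will do later. A small sketch:

url = 'https://www.dytt8.net/html/gndy/dyzz/list_23_{}.html'

# Build the URLs of pages 1-10
for page in range(1, 11):
    print(url.format(page))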
Write the regular expressions:
1. Level-1 page regex
<table width="100%".*?<td width="5%".*?<a href="(.*?)".*?ulink">(.*?)</a>.*?</table>
2. Level-2 page regex
<td style="WORD-WRAP.*?>.*?>(.*?)</a>
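Before wiring the level-1 regex into the spider, it can be checked against a small snippet. The sample HTML below is made up, shaped like the real list page:

import re

# Made-up snippet in the same shape as the level-1 page
sample = '''<table width="100%"><tr><td width="5%"></td>
<td><a href="/html/gndy/dyzz/20190101/1.html" class="ulink">幸福猎人</a></td>
</tr></table>'''

re_bds = r'<table width="100%".*?<td width="5%".*?<a href="(.*?)".*?ulink">(.*?)</a>.*?</table>'
pattern = re.compile(re_bds, re.S)
print(pattern.findall(sample))   # [('/html/gndy/dyzz/20190101/1.html', '幸福猎人')]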
Code implementation:
import csv
from urllib import request
import re
from useragents import ua_list
import time
import random

class FilmSkySpider(object):
    def __init__(self):
        self.url = 'https://www.dytt8.net/html/gndy/dyzz/list_23_{}.html'

    # Get the response content
    def get_html(self, url):
        headers = {'User-Agent': random.choice(ua_list)}
        req = request.Request(url=url, headers=headers)
        res = request.urlopen(req)
        # The page source declares charset='gb2312'
        # Use 'ignore' so that undecodable characters do not raise an error
        html = res.read().decode('gb2312', 'ignore')
        return html

    # Helper function for regex parsing
    def re_func(self, re_bds, html):
        pattern = re.compile(re_bds, re.S)
        r_list = pattern.findall(html)
        return r_list

    # Extract the data; html is the level-1 page content
    def parse_page(self, html):
        # Get the movie name and the detail-page link
        re_bds = r'<table width="100%".*?<td width="5%".*?<a href="(.*?)".*?ulink">(.*?)</a>.*?</table>'
        # one_page_list: [('/html/xxx', '幸福猎人'), ()]
        one_page_list = self.re_func(re_bds, html)
        with open('movefilm.csv', 'a', newline='') as f:
            # Initialize the writer object; do not forget the argument f
            writer = csv.writer(f)
            item = {}
            for film in one_page_list:
                item['name'] = film[1].strip()
                link = 'https://www.dytt8.net' + film[0]
                item['download'] = self.parse_two_page(link)
                time.sleep(random.uniform(1, 3))
                print(item)
                # writerow() takes one row as a list
                writer.writerow([item['name'], item['download']])

    # Parse the level-2 page
    def parse_two_page(self, link):
        html = self.get_html(link)
        re_bds = r'<td style="WORD-WRAP.*?>.*?>(.*?)</a>'
        # two_page_list: ['ftp://xxxx.mkv']
        two_page_list = self.re_func(re_bds, html)
        download = two_page_list[0].strip()
        return download

    def main(self):
        for page in range(1, 11):
            url = self.url.format(page)
            html = self.get_html(url)
            self.parse_page(html)

if __name__ == '__main__':
    start = time.time()
    spider = FilmSkySpider()
    spider.main()
    end = time.time()
    print('执行时间:%.2f' % (end - start))
Exercise: store the 电影天堂 (dytt8.net) data in a MySQL database.
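One possible way to approach this exercise: create a table for the dytt data and replace the CSV writing in parse_page() with pymysql inserts. The table name filmheaven and its columns are assumptions, not given in the notes; a minimal sketch:

import pymysql

# Assumed table, created beforehand in maoyandb:
#   create table filmheaven(name varchar(200), download varchar(1000))charset=utf8;
db = pymysql.connect(host='localhost', user='root', password='123456',
                     database='maoyandb', charset='utf8')
cursor = db.cursor()

ins = 'insert into filmheaven values(%s,%s)'
# In the spider this list would be built inside parse_page(): [(name, download), ...]
one_page_list = [('幸福猎人', 'ftp://xxxx.mkv')]
cursor.executemany(ins, one_page_list)
db.commit()

cursor.close()
db.close()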