爬虫第二篇:爬虫详解之存储数据
将数据存储到CSV文件
"""Scrape the Maoyan top-100 board page and append the movies to a CSV file."""
import urllib.request
import re
import csv

url = 'https://maoyan.com/board/4?offset=10'
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36' }

# Fetch the page
req = urllib.request.Request(url, headers=headers)
res = urllib.request.urlopen(req)
html = res.read().decode('utf-8')

# Parse the page: capture (title, star, release time) per movie.
# re.S lets '.' span newlines in the HTML.
p = re.compile(r'<div class="movie-item-info">.*?title="(.*?)".*?class="star">(.*?)</p>.*?releasetime">(.*?)</p>', re.S)
rList = p.findall(html)

# Store to CSV.
# Fix: open the file (and build the writer) ONCE instead of re-opening it for
# every row; specify encoding explicitly so Chinese titles are always UTF-8.
with open('my1.csv', 'a', newline="", encoding='utf-8') as f:
    writer = csv.writer(f)
    for r in rList:
        writer.writerow([field.strip() for field in r])
将数据存储到mysql中
"""Scrape the Maoyan top-100 board page and insert the movies into MySQL."""
import urllib.request
import re
import pymysql
import warnings

warnings.filterwarnings("ignore")

url = 'https://maoyan.com/board/4?offset=10'
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36' }

# Create the database connection.
# Fix: PyMySQL 1.0+ requires keyword arguments — positional
# connect("localhost", "root", ...) raises a TypeError there.
db = pymysql.connect(
    host="localhost",
    user="root",
    password="123456",
    database="spiderdb",
    charset="utf8",
)
cursor = db.cursor()

# Fetch the page
req = urllib.request.Request(url, headers=headers)
res = urllib.request.urlopen(req)
html = res.read().decode('utf-8')

# Parse the page: capture (title, star, release time) per movie.
p = re.compile(
    r'<div class="movie-item-info">.*?title="(.*?)".*?class="star">(.*?)</p>.*?releasetime">(.*?)</p>',
    re.S)
rList = p.findall(html)

# Store to MySQL (original comment wrongly said CSV).
# Parameterized query — never build SQL via string formatting.
ins = 'insert into film(name,star,releasetime) values(%s,%s,%s)'
try:
    for r in rList:
        row = [
            r[0].strip(),
            r[1].strip(),
            r[2].strip()[5:15],  # slice off the "上映时间:" prefix, keep YYYY-MM-DD
        ]
        cursor.execute(ins, row)
    # Fix: commit once after the loop instead of once per row — fewer
    # round-trips, and the batch is atomic.
    db.commit()
finally:
    # Fix: always release the cursor and connection.
    cursor.close()
    db.close()
将数据存储到 MongoDB 中(使用 pymongo 驱动)
"""Scrape the Maoyan top-100 board page and insert the movies into MongoDB."""
import urllib.request
import re
import pymongo

url = 'https://maoyan.com/board/4?offset=10'
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36' }

# Connect to MongoDB: database "spiderdb", collection "film".
conn = pymongo.MongoClient("127.0.0.1", 27017)
db = conn["spiderdb"]
myset = db["film"]

# Fetch the page
req = urllib.request.Request(url, headers=headers)
res = urllib.request.urlopen(req)
html = res.read().decode('utf-8')

# Parse the page: capture (title, star, release time) per movie.
p = re.compile(
    r'<div class="movie-item-info">.*?title="(.*?)".*?class="star">(.*?)</p>.*?releasetime">(.*?)</p>',
    re.S)
rList = p.findall(html)

# Store to MongoDB.
# Fix: build all documents first and use one batched insert_many instead of a
# per-document insert_one round-trip. insert_many raises on an empty list, so
# guard against a page that yielded no matches.
docs = [
    {
        "name": r[0].strip(),
        "star": r[1].strip(),
        "releasetime": r[2].strip(),
    }
    for r in rList
]
if docs:
    myset.insert_many(docs)

# Fix: close the client when done.
conn.close()