python实战项目 — 使用bs4 爬取猫眼电影热榜(存入本地txt、以及存储数据库列表)
案例一:
重点:
1. 使用bs4 爬取
2. 数据写入本地 txt
from bs4 import BeautifulSoup import requests url = "http://maoyan.com/board" header = { 'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/77.0.3865.90 Safari/537.36' } rsq = requests.get(url=url,headers=header).text soup = BeautifulSoup(rsq, "lxml") # 所有信息都在 <dd> </dd>标签中,先提取出这个标签 items = soup.select('dd') # 构建本地txt文档 with open("D://maoyan.txt", "w", encoding="utf-8") as f: for item in items: # 提取标题 title=item.select('p a[data-act="boarditem-click" ]')[0].get_text() # 提取主演 star=item.select('p[class = "star"]')[0].get_text().replace("\n", "").strip(" ") # 提取分数 score=item.select('p[class = "score"]')[0].get_text().strip('\n').strip(' ') # 提取上映时间 releasetime=item.select('p[class = "releasetime"]')[0].get_text() # 数据整合 datas=title + " " + releasetime + " " + star + " " + score + "\n" print(datas) # 利用for循环把每条datas信息写入本地 f.write(datas) f.close() print("Sucessful")
优化后
重点:
1. 链接数据库,创建表
2. 在线爬取写入数据库表
# 下面需要用requests 请求,不要用 “ from urllib import request ”
from bs4 import BeautifulSoup from urllib import request import time import pymysql # 记录开始时间 start_time = time.time() print("尝试链接服务器") try: # 链接到服务器 connect = pymysql.connect(host='主机地址', user='用户', passwd='密码', db='数据库名', port=3306) # 创建游标,对数据进行操作 cursor = connect.cursor() # 如果存在同名表就删除 cursor.execute('DROP TABLES IF EXISTS maoyan_datas') # 使用SQL语句创建表 sql1 =""" create table maoyan_datas( title CHAR(100), star CHAR(200), score CHAR(50), releasetime CHAR(200) )""" # 执行上面的sql语句 cursor.execute(sql1) # 提交执行 connect.commit() # db.close() print("链接数据库创建表完成") except Exception as e: print("链接数据库创建表失败" + str(e)) url="http://maoyan.com/board" rsq = request.urlopen(url) html = rsq.read().decode() # 解析网址 soup = BeautifulSoup(html,"lxml") # 提取前端结构中 <dd> </dd> 标签部分,因为此部分包含全部信息 items = soup.select('dd') print("已获取网站数据") sql2 = 'insert into maoyan_datas(title,star,score,releasetime) values(%s,%s,%s,%s)' i = 0 for item in items: # 提取标题 title = item.select('p a[data-act="boarditem-click" ]')[0].get_text() # 提取主演 star = item.select('p[class = "star"]')[0].get_text().replace("\n","").strip(" ") # 提取分数 score = item.select('p[class = "score"]')[0].get_text().strip('\n').strip(' ') # 提取上映时间 releasetime = item.select('p[class = "releasetime"]')[0].get_text() # 数据拼接 all = [title, str(star),str(score),str(releasetime)] #打印当前获取的电影信息 print(all) # 把电影信息写入数据库 cursor.execute(sql2,all) i = i +1 print("已写入 %s 行数据"%i) connect.close() print("done,消耗了时间: %f s" % (time.time() - start_time))