python requests bs4入门(一)-获取TOP100榜电影名字和主演写入数据库

Requests

获取猫眼TOP100榜电影名字和主演

 1 import time
 2 import requests
 3 from model import *
 4 from bs4 import BeautifulSoup
 5 headers = {"Content-Type": "text/html; charset=utf-8",
 6            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/95.0.4638.69 Safari/537.36"}
 7 
 8 url = f"https://www.maoyan.com/board/4?timeStamp=1637291300330&channelId=40011&index=2&signKey=c286a92c2bb667036254185fde905f09&sVersion=1"
 9 r = requests.get(url, timeout=3, headers=headers)
10 print(url)
11 soup = BeautifulSoup(r.text, "html5lib")
12 bb = soup.find('ul', class_="list-pager").text.replace("\n", "").split(" ")
13 page = [i for i in bb if i != '']
14 print(page[-2])
15 for i in range(0,int(page[-2])):
16     p=i*10
17     print(i)
18     url = f"https://www.maoyan.com/board/4?timeStamp=1637053092611&sVersion=1&index=7&signKey=52df1051b8c1478e914905882e09e10a&channelId=40011&requestCode=c30fbaba9d9f7b73a53f83fe71ac0ec13ztmp&offset={p}"
19     r = requests.get(url, timeout=3, headers=headers)
20     print(url)
21     time.sleep(3)
22     print(r.status_code)
23     soup = BeautifulSoup(r.text, "html5lib")
24     # 获取某标签的属性值
25     soup1 = BeautifulSoup(r.text, "html5lib")
26     aa = soup.find_all('div', class_="board-item-content")
27     for a in aa:
28         bb = []
29         datalist={}
30         dd = a.find('p', class_="name")
31         ff = a.find('p', class_="star")
32         ss = a.find('p', class_="releasetime")
33 
34         movie_name=dd.getText().replace("\n","")
35         art_name=ff.getText().replace("\n","").replace(" ","").replace("主演:","")
36         movie_time=ss.getText().replace("\n", "").replace(" ", "").replace("上映时间:","")
37         print(movie_name)
38         print(art_name)
39         print(movie_time)
40         bb.append(dict(movie_name=movie_name, art_name=art_name, movie_time=movie_time, remark='猫眼'))
41         for l in bb:
42             find_data = SQLsession.query(Infos).filter_by(movie_name=l['movie_name'], remark='猫眼').first()
43             if not find_data:
44                 SQLsession.add(Infos(**l))
45         SQLsession.commit()
 1 from sqlalchemy import *
 2 import pymysql
 3 from sqlalchemy.orm import sessionmaker
 4 from sqlalchemy.ext.declarative import declarative_base
 5 from datetime import datetime
 6 
# MySQL connection string (SQLAlchemy + PyMySQL driver); utf8mb4 covers full
# Unicode including rare CJK characters.
# NOTE(review): credentials are hard-coded — consider moving to environment/config.
database = 'mysql+pymysql://root:1234@localhost/test_ten?charset=utf8mb4'

# Declarative base that all ORM models inherit from.
Base = declarative_base()
# Create the database engine (lazy: connects on first use).
engine = create_engine(database)
DBSession = sessionmaker(bind=engine)
# Shared session instance used by the scraper scripts.
SQLsession = DBSession()
14 
15 
16 # ORM
17 class Infos(Base):
18     __tablename__ = 'test_table1'
19     id = Column(Integer(), primary_key=True)
20     code = Column(String(255))
21     movie_name = Column(String(255))
22     art_name = Column(String(255))
23     movie_time=Column(String(255))
24     status = Column(Integer(), default=1)
25     remark = Column(Text)
26     created = Column(DateTime, default=datetime.now())
27     updated = Column(DateTime, default=datetime.now(), onupdate=datetime.now())
28 
29 
30 Base.metadata.create_all(engine)

 

如果出现以下情况,点进去完成滑动验证后重新运行,就可以正常获取数据。

豆瓣电影

 1 import requests
 2 from bs4 import BeautifulSoup
 3 url='https://movie.douban.com/chart'
 4 headers={"Content-Type":"text/html; charset=utf-8",
 5          "User-Agent":"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/95.0.4638.69 Safari/537.36"}
 6 print(url)
 7 r=requests.get(url, timeout=3, headers=headers)
 8 soup=BeautifulSoup(r.text,"html5lib")
 9 name=soup.find_all('div',class_='pl2')
10 for i in name:
11     aa=i.getText().replace("\n","").replace("/","")
12     print(aa)

Requests-HTML

 1 # 使用Requests-HTML模拟Ajax请求来获取网页数据
 2 from requests_html import HTMLSession
 3 url='https://movie.douban.com/chart'
 4 headers={"Content-Type":"text/html; charset=utf-8",
 5          "User-Agent":"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/95.0.4638.69 Safari/537.36"}
 6 print(url)
 7 # Requests-HTML的Ajax加载功能,Requests-HTML只能使用Requests的Session模式
 8 session=HTMLSession()
 9 r=session.get(url)
10 # Chromium浏览器加载网页
11 # Ajax加载功能由render()方法实现
12 r.html.render()
13 name=r.html.find('div.pl2')
14 for g in name:
15     print(g.text)

json解析

参考:python的requests爬取Json数据,从Json数据中提取标题和图片(腾讯视频的某综艺节目)_Mr.Pan_学狂-CSDN博客

 

 

 

 1 # 使用Requests-HTML模拟Ajax请求来获取网页数据
 2 from requests_html import HTMLSession
 3 import json
 4 url='https://movie.douban.com/tag/#/'
 5 headers={"Content-Type":"text/html; charset=utf-8",
 6          "User-Agent":"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/95.0.4638.69 Safari/537.36"}
 7 print(url)
 8 # Requests-HTML的Ajax加载功能,Requests-HTML只能使用Requests的Session模式
 9 session=HTMLSession()
10 r=session.get(url)
11 # Chromium浏览器加载网页
12 # Ajax加载功能由render()方法实现
13 r.html.render()
14 # name=r.html.find('span.title')
15 # for g in name:
16 #     print(g.text)
17 for d in range(0,5):
18     p = d * 20
19     url1=f"https://movie.douban.com/j/new_search_subjects?sort=U&range=0,10&tags=&start={p}"
20     r2 = session.get(url1)
21     print(r2.text)
22     json_data = json.loads(r2.text)  # 加载内容
23     Data_ls = json_data['data'] # 取出data对应的值,是list列表类型
24     title_dt = {}  # 定义一个空字典,用于存储标题和对应的演员
25     for D in Data_ls:
26         try:  # 加入异常处理机制,使不满足条件的时候跳过,而不报错。
27             if type(D) is dict:
28                 title_dt[D['title']] = D['casts']
29         except Exception:  # 出现异常的时候,跳过
30             continue
31     print(title_dt)#打印字典


posted @ 2021-12-08 16:00  野猫炫  阅读(208)  评论(0编辑  收藏  举报