Scraping Douban Movie Top250 data with the requests library

The re library

Basic usage of the re library: https://www.cnblogs.com/ikventure/p/14853758.html

Runoob tutorial: https://www.runoob.com/python3/python3-reg-expressions.html

  • findall() returns all matching substrings as a list

  • finditer() returns an iterator of match objects (call .group() to get each substring)

  • search() finds the first match anywhere in the string

  • match() matches only at the start of the string

  • compile() precompiles a pattern so it can be reused (a side-by-side comparison of these functions follows the grouping example below)

    pat = re.compile(r'\d{3}')
    res = pat.search('leooooo223')
    print(res.group())  # 223
    
  • Extracting specific parts of a match separately (grouping)

import re

s = """
<div class='messi'><span id='10'>梅西</span></div>
<div class='neymar'><span id='11'>内马尔</span></div>
<div class='kun'><span id='19'>阿圭罗</span></div>
<div class='suarez'><span id='9'>苏亚雷斯</span></div>
<div class='rakitic'><span id='4'>拉基蒂奇</span></div>
"""

# Goal: extract fields like messi / 10 / 梅西
# re.S lets . match newline characters too
obj1 = re.compile(r"<div class='.*?'><span id='\d+'>.*?</span></div>", re.S)

result1 = obj1.finditer(s)
for i in result1:
    print(i.group())

# Wrap the part to capture in parentheses and add ?P<group_name> after the opening paren, e.g. <span id='(?P<number>\d+)'>
obj2 = re.compile(r"<div class='.*?'><span id='(?P<number>\d+)'>(?P<ChineseName>.*?)</span></div>", re.S)

result2 = obj2.finditer(s)
for j in result2:
    print(j.group("number"), j.group('ChineseName'))
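A minimal side-by-side of the four matching functions listed above, run against a throwaway string (both the string and the pattern are purely illustrative):

import re

text = 'a1b22c333'
print(re.findall(r'\d+', text))         # ['1', '22', '333'] -- list of substrings
for m in re.finditer(r'\d+', text):     # iterator of match objects
    print(m.group())                    # 1, 22, 333
print(re.search(r'\d+', text).group())  # 1 -- first match anywhere in the string
print(re.match(r'\d+', text))           # None -- the string does not start with a digit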

Douban Movie Top250

URL: https://movie.douban.com/top250

Each page holds 25 entries; page 2 is https://movie.douban.com/top250?start=25&filter=

Changing the number after 'start=' steps through the rest of the list.
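For example, all ten page URLs can be generated from the start parameter (a quick sketch; the refactored script below does the same thing with range(0, 250, 25)):

for start in range(0, 250, 25):
    print(f'https://movie.douban.com/top250?start={start}&filter=')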

  1. Check whether the page is server-rendered or client-rendered (view the page source) -- server-rendered here
  2. Fetch the page source with requests.get()
  3. Parse the data with re.compile(), matching the fields via regex groups
  4. Extract the matches with finditer
  5. Display the data with f-string padding, using the full-width CJK space chr(12288) (see the demo below)
  6. Save the data to CSV
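A quick demo of the padding trick in step 5: chr(12288) is the full-width CJK space, so Chinese titles line up in fixed-width columns where the ordinary half-width space would not (the two titles here are just examples):

print(f"{'肖申克的救赎':{chr(12288)}<15}1994")
print(f"{'霸王别姬':{chr(12288)}<15}1993")

First, a single-page version of the scraper: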
import requests
import re
import csv

url = "https://movie.douban.com/top250"
# Douban tends to reject the default requests User-Agent, so send a browser one
ua = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
                  "AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.131 Safari/537.36"
}
page = requests.get(url, headers=ua)
page_source = page.text

# Named groups: name, year (the text after <br>), rate, number (count of raters)
pat = re.compile(r'<div class="item">.*?<span class="title">(?P<name>.*?)</span>'
                 r'.*?<p class="">.*?<br>(?P<year>.*?)&nbsp'
                 r'.*?<span class="rating_num" property="v:average">(?P<rate>.*?)</span>'
                 r'.*?<span>(?P<number>.*?)人评价</span>', re.S)

result = pat.finditer(page_source)

# newline='' stops csv from inserting blank lines between rows on Windows
f = open('dbtop250.csv', mode='w', encoding='utf-8', newline='')
csv_writer = csv.writer(f)

for item in result:
    """
    Display test (the columns still misalign when a title mixes Chinese and English)
    name = item.group("name")
    year = item.group("year").strip()
    rate = item.group("rate")
    number = item.group("number")
    print(f"{name:{chr(12288)}<15}{year:<10}{rate:<10}{number:>10}")
    """
    dic = item.groupdict()
    dic['year'] = dic['year'].strip()
    csv_writer.writerow(dic.values())
f.close()
page.close()
print("over!")
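One small refinement (hypothetical, not in the original script): writing a header row before the loop would make the CSV columns self-describing:

csv_writer.writerow(['name', 'year', 'rate', 'number'])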

Refactoring the code and fetching the complete data

"""
modified douban movie top250
"""

import requests
import re
import csv

# url = "https://movie.douban.com/top250"
ua = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
                  "AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.131 Safari/537.36"
}


def get_page(url):
    """Fetch one Top250 page and return an iterator of regex match objects."""
    page = requests.get(url, headers=ua)
    page_source = page.text
    pat = re.compile(r'<div class="item">.*?<span class="title">(?P<name>.*?)</span>'
                     r'.*?<p class="">.*?<br>(?P<year>.*?)&nbsp'
                     r'.*?<span class="rating_num" property="v:average">(?P<rate>.*?)</span>'
                     r'.*?<span>(?P<number>.*?)人评价</span>', re.S)
    res = pat.finditer(page_source)
    page.close()
    return res


with open('db250.csv', 'w', encoding='utf-8', newline='') as f:
    csv_writer = csv.writer(f)
    for start in range(0, 250, 25):
        douban_url = f'https://movie.douban.com/top250?start={start}&filter='
        result = get_page(douban_url)
        for item in result:
            dic = item.groupdict()
            dic['year'] = dic['year'].strip()
            csv_writer.writerow(dic.values())

print("over!")

电影天堂 (dytt89.com)

  1. Pick the target section on the home page
  2. Collect the links to each movie in that section
  3. Fetch each link's page source and find the download address
import requests
import re

domain = "https://dytt89.com/"

page = requests.get(domain)
page.encoding = 'gb2312'  # the page source declares charset=gb2312
page_source = page.text

# Locate the '2021必看热片' (2021 must-see) block
pat1 = re.compile(r"2021必看热片.*?<ul>(?P<ul>.*?)</ul>", re.S)
# Locate the hyperlinks: <a href=''>xx</a>
pat2 = re.compile(r"<a href='(?P<link>.*?)'", re.S)
# Locate the movie title
pat3 = re.compile(r"◎片  名(?P<name>.*?)<br />", re.S)
# Locate the download address
pat4 = re.compile(r'<td style="WORD-WRAP: break-word" bgcolor="#fdfddf"><a href="(?P<download>.*?)">')

res1 = pat1.search(page_source)
ul = res1.group("ul")

res2 = pat2.finditer(ul)
child_url_list = []  # list of links to the child pages
for item2 in res2:
    child_link = item2.group('link')
    child_url = domain + child_link
    child_url_list.append(child_url)

# Extract the content of each child page
for url in child_url_list:
    child_page = requests.get(url)
    child_page.encoding = 'gb2312'
    child_page_source = child_page.text
    # Some pages have no ◎片名 field; fall back to the page title instead
    try:
        res3 = pat3.search(child_page_source).group("name").strip()
    except AttributeError:
        pat_back = re.compile(r'<div class="title_all"><h1>(?P<title>.*?)</h1>')
        res3 = pat_back.search(child_page_source).group("title").strip()
    print(res3)
    res4 = pat4.search(child_page_source).group("download")
    print(res4)
    child_page.close()  # close each child response too

page.close()
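As a possible extension (a sketch, not in the original script), the name/download pairs printed in the loop could be collected and saved the same way as in the Douban script; movie_list here is a hypothetical list of (name, download) tuples appended inside the loop above:

import csv

with open('dytt.csv', 'w', encoding='utf-8', newline='') as f:
    writer = csv.writer(f)
    writer.writerow(['name', 'download'])
    writer.writerows(movie_list)  # movie_list: [(name, download), ...]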