爬虫(1)-手刃豆瓣

# -*- coding: utf-8 -*-
"""
@Time    :  2022/3/18 15:53
@Author  : Andrew
@File    : 豆瓣top250.py
"""
import requests  # 拿到页面源代码
import re  # 提取有效信息
import csv

# Scrape the Douban "Top 250" movie list and save name / year / score /
# number of ratings to data.csv.

# Header row of the CSV; the order matches the named groups of MOVIE_PATTERN.
CSV_HEADER = ("电影名字", "年份", "评分", "评价人数")

# Browser-like User-Agent sent with every request (built once, not per page).
HEADERS = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) "
                  "Chrome/77.0.3865.120 "
                  "Safari/537.36 Core/1.77.97.400 QQBrowser/10.9.4621.400 "
}

# Seconds to wait for the server before giving up, so a stalled connection
# cannot hang the script indefinitely.
REQUEST_TIMEOUT = 10

# How the pattern is built:
#   1. Anchor on an enclosing parent tag (<li>) so each match is confined to
#      one list entry.
#   2. Between <li> and <div class="item"> there is only whitespace/newlines;
#      skip it lazily with .*? (re.S lets "." cross line breaks).
#   3. Everything up to <span class="title"> is irrelevant and skipped lazily.
#   4. The wanted fields are captured with named groups (?P<name>.*?).
#   5. No extra whitespace may be added inside the pattern: it would be
#      matched literally.
# Compiled once at module level instead of once per page.
MOVIE_PATTERN = re.compile(
    r'<li>.*?<div class="item">.*?<span class="title">(?P<movieName>.*?)</span>'
    r'.*?<p class="">.*?<br>(?P<year>.*?)&nbsp'
    r'.*?<span class="rating_num" property="v:average">(?P<score>.*?)</span>'
    r'.*?<span>(?P<number>.*?)人评价',
    re.S,
)


def parse_movies(page_content):
    """Extract movie rows from the HTML source of one list page.

    Args:
        page_content: HTML text of a page.

    Returns:
        A list of ``(movieName, year, score, number)`` string tuples, one per
        match of ``MOVIE_PATTERN``, in document order. The year has its
        surrounding whitespace stripped. Empty list when nothing matches.
    """
    rows = []
    for match in MOVIE_PATTERN.finditer(page_content):
        fields = match.groupdict()
        # The captured year is preceded by a newline and indentation.
        fields["year"] = fields["year"].strip()
        rows.append(tuple(fields.values()))
    return rows


def main():
    """Download all ten list pages and write the parsed rows to data.csv.

    Raises:
        requests.RequestException: on a network failure, timeout, or a
            non-success HTTP status (via ``raise_for_status``).
    """
    # newline="" prevents the csv module from emitting extra blank lines;
    # the with-block closes the file even if a request or parse step fails.
    with open("data.csv", mode="w", encoding="utf-8", newline="") as f:
        writer = csv.writer(f)
        writer.writerow(CSV_HEADER)
        # The list is paginated 25 entries per page via the "start" parameter.
        for page in range(0, 250, 25):
            print(page)
            url = f"https://movie.douban.com/top250?start={page}&filter="
            print(url)
            # The response is a context manager, so the connection is
            # released even when an exception is raised.
            with requests.get(url, headers=HEADERS, timeout=REQUEST_TIMEOUT) as resp:
                # Fail loudly instead of silently writing zero rows for an
                # error / anti-bot page.
                resp.raise_for_status()
                page_content = resp.text
            writer.writerows(parse_movies(page_content))


if __name__ == "__main__":
    main()

 

posted @ 2022-03-18 17:03  乔十六  阅读(45)  评论(0)  编辑  收藏  举报