# 爬虫(1)-手刃豆瓣 (Web crawler, part 1: scraping Douban by hand)
# -*- coding: utf-8 -*-
"""
Scrape the Douban Top 250 movie chart and save it to ``data.csv``.

@Time : 2022/3/18 15:53
@Author : Andrew
@File : 豆瓣top250.py
"""
import csv  # write the result file
import re  # extract the wanted fields from the page source

BASE_URL = "https://movie.douban.com/top250"
PAGE_SIZE = 25  # step of the ``start`` query parameter between pages
TOTAL_MOVIES = 250
REQUEST_TIMEOUT = 10  # seconds; without a timeout a stalled connection hangs forever

HEADERS = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) "
                  "Chrome/77.0.3865.120 "
                  "Safari/537.36 Core/1.77.97.400 QQBrowser/10.9.4621.400 "
}

# Column captions of the CSV file, in the order the rows are written.
CSV_HEADER = {'title': "电影名字", 'year': "年份", 'score': "评分", 'number': "评价人数"}

# How the pattern is built:
# 1. Anchor on an enclosing parent tag (<li>) so each match stays inside one
#    movie entry.
# 2. Between <li> and <div class="item"> there is only whitespace, so a lazy
#    ``.*?`` bridges it; the same trick skips everything that is irrelevant
#    between the captured fields.
# 3. Each wanted value is captured with a named group (``(?P<name>.*?)``) so
#    it can be read back by name.
# 4. Every literal character in the pattern is significant: do not add
#    spaces for readability.
# The pattern is compiled once here instead of once per downloaded page.
# NOTE(review): the year is terminated by the ``&nbsp;`` entity (or an
# already-decoded U+00A0) rather than by a plain space, because a plain space
# would stop the lazy group at the whitespace that follows ``<br>`` and leave
# the year empty. This assumes the page source separates the year from the
# country with ``&nbsp;`` - confirm against the live markup.
MOVIE_PATTERN = re.compile(
    r'<li>.*?<div class="item">.*?<span class="title">(?P<movieName>.*?)</span>'
    r'.*?<p class="">.*?<br>(?P<year>.*?)(?:&nbsp;|\xa0)'
    r'.*?<span class="rating_num" property="v:average">(?P<score>.*?)</span>'
    r'.*?<span>(?P<number>.*?)人评价',
    re.S,
)


def parse_movies(page_content):
    """Extract the movie rows from the HTML of one chart page.

    :param page_content: HTML source of a chart page.
    :return: list of ``[name, year, score, number]`` rows, one per entry
        matched by ``MOVIE_PATTERN``; empty when nothing matches.
    """
    return [
        [
            match.group("movieName"),
            match.group("year").strip(),  # drop the newline/indent after <br>
            match.group("score"),
            match.group("number"),
        ]
        for match in MOVIE_PATTERN.finditer(page_content)
    ]


def fetch_page(start):
    """Download the chart page whose first entry has offset ``start``.

    :param start: value of the ``start`` query parameter.
    :return: the decoded response body.
    :raises requests.RequestException: on connection errors, timeouts and
        non-success HTTP status codes.
    """
    # Imported here so that the parsing helper stays importable (and
    # testable) on machines without the third-party HTTP library.
    import requests

    url = f"{BASE_URL}?start={start}&filter="
    print(url)
    resp = requests.get(url, headers=HEADERS, timeout=REQUEST_TIMEOUT)
    try:
        # Fail loudly instead of silently writing zero rows for an error page.
        resp.raise_for_status()
        return resp.text
    finally:
        resp.close()


def main():
    """Scrape every chart page and write all rows to ``data.csv``."""
    # utf-8 keeps the Chinese text intact; newline="" prevents blank lines
    # between rows when the file is opened in Excel.  The ``with`` block
    # closes the file even if a request or the parsing fails half-way.
    with open("data.csv", mode="w", encoding="utf-8", newline="") as f:
        csv_writer = csv.writer(f)
        csv_writer.writerow(CSV_HEADER.values())  # header row
        for page in range(0, TOTAL_MOVIES, PAGE_SIZE):
            print(page)
            csv_writer.writerows(parse_movies(fetch_page(page)))


if __name__ == "__main__":
    main()