Web Scraping in Practice (5): Scraping the Douban Top 250
I. URL Analysis

1. Page analysis

Packet-capture analysis shows that the data is not loaded dynamically; the page is static, so we can send a request straight to the page URL and read the data out of the HTML.
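As a quick check, a minimal sketch (the full request headers are built in the code section below) is to fetch the raw HTML and look for a known title:

```python
# Minimal check that the page is static: a known movie title should
# appear directly in the raw HTML, with no JavaScript rendering needed.
import requests

resp = requests.get(
    "https://movie.douban.com/top250?start=0",
    headers={"User-Agent": "Mozilla/5.0"},  # Douban tends to reject requests without a browser UA
)
resp.encoding = "utf-8"
print("肖申克的救赎" in resp.text)  # True -> the data is in the static page
```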
2. Source analysis

Inspecting the page with the F12 developer tools shows that the data lives in an `ol` tag with the class name `grid_view`. Since that class name is unique on the page, we can use this node to locate our data and then iterate over its `li` tags to extract the content.
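A quick sanity check (reusing `resp` from the sketch above) confirms that the class is unique and that each page holds 25 entries:

```python
from lxml import etree

tree = etree.HTML(resp.text)
print(len(tree.xpath('//ol[@class="grid_view"]')))     # 1  -> the class is unique on the page
print(len(tree.xpath('//ol[@class="grid_view"]/li')))  # 25 -> one li per movie
```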
3. Content parsing

Each `li` holds one movie's fields: the cover image (`img/@src`), the title (`img/@alt`), the detail-page link, the year/genre line, the rating, and the one-line quote. Section II extracts each of these with relative XPath expressions.
4. Link analysis
| """ |
| 1. https://movie.douban.com/top250?start=0 |
| 2. https://movie.douban.com/top250?start=25 |
| 3. https://movie.douban.com/top250?start=50 |
| n. https://movie.douban.com/top250?start=25*(n-1) |
| """ |
| urls = [https://movie.douban.com/top250?start=25*(i-1) for i in range(11)] |
So we can crawl the pages with a plain for loop, or generate the links first and consume them stack-style with pop() (the approach used below), or use recursion, as in the sketch that follows.
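For completeness, here is a sketch of the recursive variant (`crawl_page` is a hypothetical helper, not part of the final script below):

```python
def crawl_page(n: int, last: int = 10) -> None:
    """Visit page n, then recurse into page n+1 until all pages are done."""
    if n > last:  # base case: past the 10th page
        return
    url = f"https://movie.douban.com/top250?start={25 * (n - 1)}"
    print(url)    # a real crawler would fetch and parse here
    crawl_page(n + 1, last)

crawl_page(1)
```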
II. Writing the Code

1. Getting each page's URL
```python
__author__ = "A.L.Kun"
__file__ = "123.py"
__time__ = "2022/7/6 10:19"

import requests, re
from lxml import etree
from fake_useragent import UserAgent
import pandas as pd

# Build the 10 page URLs in reverse (start=225 ... start=0), so that
# popping from the end of the list visits start=0 first
urls = [f'https://movie.douban.com/top250?start={25*(i-1)}' for i in range(10, 0, -1)]

headers = {
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
    'Accept-Encoding': 'gzip, deflate, br',
    'Accept-Language': 'zh-CN,zh;q=0.9',
    'Cache-Control': 'no-cache',
    'Connection': 'keep-alive',
    'Host': 'movie.douban.com',
    'Pragma': 'no-cache',
    'sec-ch-ua': '"Not A;Brand";v="99", "Chromium";v="102", "Google Chrome";v="102"',
    'sec-ch-ua-mobile': '?0',
    'sec-ch-ua-platform': '"Windows"',
    'Sec-Fetch-Dest': 'document',
    'Sec-Fetch-Mode': 'navigate',
    'Sec-Fetch-Site': 'none',
    'Sec-Fetch-User': '?1',
    'Upgrade-Insecure-Requests': '1',
    'user-agent': UserAgent().random  # random User-Agent per run
}

lis_data = []  # collected rows, one dict per movie

# Treat the URL list as a stack; for now just print each URL to check the order
while urls:
    print(urls.pop())
```
2. Getting the li tags inside the ol
```python
def get_tags(url):
    headers.update({'user-agent': UserAgent().random})  # rotate the UA per request
    resp = requests.get(url, headers=headers)
    resp.encoding = "utf-8"
    tree = etree.HTML(resp.text)
    # The grid_view ol is unique, so this yields the 25 li elements of the page
    ol = tree.xpath('//*[@id="content"]/div/div[1]/ol/li')
    for li in ol:
        print(li)  # prints 25 lxml Element objects

get_tags("https://movie.douban.com/top250?start=0")
```
3. Extracting the data
```python
def get_data(li):
    try:
        # xpath() returns a list; indexing [0] raises IndexError if it is empty
        imgSrc = li.xpath(".//img/@src")[0].replace("webp", "jpg")
    except IndexError:
        imgSrc = "Image not found"
    title = li.xpath(".//img/@alt")[0]
    detailUrl = li.xpath(".//div[@class='hd']/a/@href")[0]
    detail = li.xpath(".//div[@class='bd']/p[1]/text()")
    # Year and genres both sit in the second text node; see the worked example after this block
    time = re.search(r"\d+", detail[1]).group()
    type_ = " ".join(re.findall(r"[\u4e00-\u9fa5]+", detail[1]))
    score = li.xpath(".//span[@class='rating_num']/text()")[0]
    quote = li.xpath(".//span[@class='inq']/text()")[0]  # crashes if a movie has no quote; handled in the complete code

    lis_data.append({
        "Title": title,
        "Image URL": imgSrc,
        "Detail page URL": detailUrl,
        "Release year": time,
        "Genre": type_,
        "Rating": score,
        "Quote": quote
    })


resp = requests.get("https://movie.douban.com/top250?start=25", headers=headers)
resp.encoding = "utf-8"
tree = etree.HTML(resp.text)
ol = tree.xpath('//*[@id="content"]/div/div[1]/ol/li')
for li in ol:
    get_data(li)
print(lis_data)
```
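To see what the two regular expressions in `get_data` extract, here is a worked example; the sample string mirrors the shape of the second text node of the `.bd` paragraph (exact spacing on the live page may differ):

```python
import re

detail_1 = "1994 / 美国 / 犯罪 剧情"   # assumed shape of detail[1]
print(re.search(r"\d+", detail_1).group())                  # '1994' (release year)
print(" ".join(re.findall(r"[\u4e00-\u9fa5]+", detail_1)))  # '美国 犯罪 剧情'
```

Note that the genre pattern matches any run of CJK characters, so the country name is captured alongside the genres.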
4. Data cleaning
```python
def parse_data():
    df = pd.DataFrame(lis_data)
    new_df = df.dropna()  # drop any row with a missing field

    new_df.to_excel("./douban.xlsx", index=False)  # index=False omits the row-index column

parse_data()
```
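A tiny sketch of what `dropna()` does here (hypothetical rows, not real scraped data): any row with a missing field is dropped entirely:

```python
import pandas as pd

df = pd.DataFrame([
    {"Title": "A", "Rating": "9.7", "Quote": "..."},
    {"Title": "B", "Rating": "9.6", "Quote": None},  # missing quote
])
print(len(df.dropna()))  # 1 -> the incomplete row is removed
```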
III. Complete Code
```python
__author__ = "A.L.Kun"
__file__ = "123.py"
__time__ = "2022/7/6 10:19"

import requests, re
from lxml import etree
from fake_useragent import UserAgent
import pandas as pd
import logging

# A bare logging.Logger(__name__) has no handlers, so log.info() would print
# nothing; configure logging and fetch the logger the standard way instead.
logging.basicConfig(level=logging.INFO)
log = logging.getLogger(__name__)

# Build the 10 page URLs in reverse so that pop() visits start=0 first
urls = [f'https://movie.douban.com/top250?start={25*(i-1)}' for i in range(10, 0, -1)]

headers = {
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
    'Accept-Encoding': 'gzip, deflate, br',
    'Accept-Language': 'zh-CN,zh;q=0.9',
    'Cache-Control': 'no-cache',
    'Connection': 'keep-alive',
    'Host': 'movie.douban.com',
    'Pragma': 'no-cache',
    'sec-ch-ua': '"Not A;Brand";v="99", "Chromium";v="102", "Google Chrome";v="102"',
    'sec-ch-ua-mobile': '?0',
    'sec-ch-ua-platform': '"Windows"',
    'Sec-Fetch-Dest': 'document',
    'Sec-Fetch-Mode': 'navigate',
    'Sec-Fetch-Site': 'none',
    'Sec-Fetch-User': '?1',
    'Upgrade-Insecure-Requests': '1',
    'user-agent': UserAgent().random
}

lis_data = []

def get_data(li):
    try:
        # xpath() returns a list; indexing [0] raises IndexError if it is empty
        imgSrc = li.xpath(".//img/@src")[0].replace("webp", "jpg")
    except IndexError:
        imgSrc = "Image not found"
    title = li.xpath(".//img/@alt")[0]
    detailUrl = li.xpath(".//div[@class='hd']/a/@href")[0]
    detail = li.xpath(".//div[@class='bd']/p[1]/text()")
    time = re.search(r"\d+", detail[1]).group()
    type_ = " ".join(re.findall(r"[\u4e00-\u9fa5]+", detail[1]))
    score = li.xpath(".//span[@class='rating_num']/text()")[0]
    try:
        quote = li.xpath(".//span[@class='inq']/text()")[0]
    except IndexError:
        quote = "No quote available"  # some entries have no one-line quote

    lis_data.append({
        "Title": title,
        "Image URL": imgSrc,
        "Detail page URL": detailUrl,
        "Release year": time,
        "Genre": type_,
        "Rating": score,
        "Quote": quote
    })

def get_tags(url):
    headers.update({'user-agent': UserAgent().random})  # rotate the UA per request
    resp = requests.get(url, headers=headers)
    resp.encoding = "utf-8"
    tree = etree.HTML(resp.text)
    ol = tree.xpath('//*[@id="content"]/div/div[1]/ol/li')
    for li in ol:
        get_data(li)
    log.info(f"{url}, page scraped")

def parse_data():
    df = pd.DataFrame(lis_data)
    new_df = df.dropna()
    # Recent pandas versions dropped to_excel's encoding argument, so it is omitted here
    new_df.to_excel("./douban.xlsx", index=False)


def main():
    while urls:
        get_tags(urls.pop())  # stack-style traversal of the 10 pages
    parse_data()

if __name__ == "__main__":
    main()
```