从零开始--01-前期数据准备工作
在线视频数据爬取准备工作
爬取好看视频网站的数据
网站分析
搜索框内的提示信息
url: https://haokan.baidu.com/videoui/api/hotwords?sfrom=pc
{
"errno": 0,
"error": "成功",
"data": {
"requestParam": [],
"response": {
"hotwords": [
{
"title": "曝中芯国际内讧",
"hot_num": 4952186
},
{
"title": "男子驾校身亡校方打人",
"hot_num": 4928534
},
{
"title": "于正否认代孕生子",
"hot_num": 4916454
},
{
"title": "金晨拒绝陈一鸣",
"hot_num": 4887538
},
{
"title": "洪欣否认与张丹峰离婚",
"hot_num": 4872596
},
{
"title": "小学生留字条后跳楼",
"hot_num": 4858426
},
{
"title": "国产鸡腿外包装阳性",
"hot_num": 4829587
},
{
"title": "火锅店检出禁用兽药",
"hot_num": 4801426
},
{
"title": "网曝王菲谢霆锋分手",
"hot_num": 4778371
},
{
"title": "糖水燕窝曝光者抑郁",
"hot_num": 4762748
}
]
}
}
}
影视
{
"errno": 0,
"error": "成功",
"data": {
"requestParam": [],
"response": {
"videos": [
{
"id": "5935263900090481104",
"title": "霸总怎么都想不到,他随手救下的小孩,居然是他的亲儿子!",
"poster": "https://tukuimg.bdstatic.com/processed/8e8086eb7cf3f54c90da2d590f652484.jpg@s_2,w_454,h_256,q_100",
"poster_small": "https://tukuimg.bdstatic.com/processed/8e8086eb7cf3f54c90da2d590f652484.jpg@s_2,w_454,h_256,q_100",
"poster_big": "https://tukuimg.bdstatic.com/processed/8e8086eb7cf3f54c90da2d590f652484.jpg@s_2,w_681,h_381,q_100",
"poster_pc": "https://tukuimg.bdstatic.com/processed/8e8086eb7cf3f54c90da2d590f652484.jpg@s_2,w_681,h_381,q_100,f_webp",
"source_name": "好剧渲染",
"play_url": "http://vd3.bdstatic.com/mda-kj6edbgpk3cs0qz4/cae_h264_nowatermark/1606875218/mda-kj6edbgpk3cs0qz4.mp4",
"playcnt": 549866,
"mthid": "1634935029156178",
"mthpic": "https://pic.rmb.bdstatic.com/bjh/user/94863b3c176d3223a379e0e206876aa0.jpeg?x-bce-process=image/resize,m_lfit,w_100,h_100",
"threadId": "1059000036007127",
"site_name": null,
"duration": "10:00",
"url": "https://haokan.baidu.com/v?pd=pc&vid=5935263900090481104",
"cmd": "baiduboxapp://v1/easybrowse/open?upgrade=1&type=video&url=https%3A%2F%2Fhaokan.baidu.com%2F%2Fv%3Fcontext%3D%257B%2522nid%2522%253A%25225935263900090481104%2522%257D%26backflow%3D1%26pd%3Dpc&style=%7B%22toolbaricons%22%3A%7B%22toolids%22%3A%5B%221%22%2C%222%22%2C%223%22%5D%7D%2C%22menumode%22%3A2%7D&newbrowser=1&slog=%257B%2522from%2522%253A%2522feed%2522%252C%2522page%2522%253A%2522sv%2522%257D",
"loc_id": "http://www.internal.video.baidu.com/5149be5226f83954df8b41ac83a9b546.html",
"commentInfo": {
"source": "baidumedia",
"key": "1679857347109984154"
},
"comment_id": "1679857347109984154",
"show_tag": 0,
"publish_time": "2020年10月07日",
"new_cate_v2": "影视",
"appid": "",
"path": "",
"channel_name": "",
"channel_total_number": "",
"channel_poster": "",
"like": 7628,
"fmlike": "7628",
"comment": "0",
"fmcomment": "0次播放",
"fmplaycnt": "55万次播放",
"fmplaycnt_2": "55万",
"outstand_tag": ""
}
]
}
}
}
音乐
{
"errno": 0,
"error": "成功",
"data": {
"requestParam": [],
"response": {
"videos": [
{
"id": "13103412906996698717",
"title": "福禄寿《马》太走心了!安静的歌曲,听完久久不能回神,超治愈!",
"poster": "https://videopic.bdstatic.com/hk/2009/1601129084ef608f311f92b2890366bed0c42af86b.jpg@s_2,w_454,h_256,q_100",
"poster_small": "https://videopic.bdstatic.com/hk/2009/1601129084ef608f311f92b2890366bed0c42af86b.jpg@s_2,w_454,h_256,q_100",
"poster_big": "https://videopic.bdstatic.com/hk/2009/1601129084ef608f311f92b2890366bed0c42af86b.jpg@s_2,w_681,h_381,q_100",
"poster_pc": "https://videopic.bdstatic.com/hk/2009/1601129084ef608f311f92b2890366bed0c42af86b.jpg@s_2,w_681,h_381,q_100,f_webp",
"source_name": "爱奇艺乐队的夏天",
"play_url": "http://vd3.bdstatic.com/mda-kirx940rtix09ib7/mda-kirx940rtix09ib7.mp4?playlist=%5B%22hd%22%2C%22sc%22%5D",
"playcnt": 212436,
"mthid": "12664",
"mthpic": "http://pic.rmb.bdstatic.com/5f08c783c187a4e9a1f95815b8fde48d.jpeg?x-bce-process=image/resize,m_lfit,w_100,h_100",
"threadId": "1084000035700070",
"site_name": null,
"duration": "00:59",
"url": "https://haokan.baidu.com/v?pd=pc&vid=13103412906996698717",
"cmd": "baiduboxapp://v1/easybrowse/open?upgrade=1&type=video&url=https%3A%2F%2Fhaokan.baidu.com%2F%2Fv%3Fcontext%3D%257B%2522nid%2522%253A%252213103412906996698717%2522%257D%26backflow%3D1%26pd%3Dpc&style=%7B%22toolbaricons%22%3A%7B%22toolids%22%3A%5B%221%22%2C%222%22%2C%223%22%5D%7D%2C%22menumode%22%3A2%7D&newbrowser=1&slog=%257B%2522from%2522%253A%2522feed%2522%252C%2522page%2522%253A%2522sv%2522%257D",
"loc_id": "http://www.internal.video.baidu.com/a85ad5b2e0b4413f95dc7b2dee4b603e.html",
"commentInfo": {
"source": "baidumedia",
"key": "1678905779785808740"
},
"comment_id": "1678905779785808740",
"show_tag": 0,
"publish_time": "2020年09月26日",
"new_cate_v2": "音乐",
"appid": "",
"path": "",
"channel_name": "",
"channel_total_number": "",
"channel_poster": "",
"like": 1692,
"fmlike": "1692",
"comment": "16",
"fmcomment": "16",
"fmplaycnt": "21万次播放",
"fmplaycnt_2": "21万",
"outstand_tag": ""
}
]
}
}
}
VLOG
{
"errno": 0,
"error": "成功",
"data": {
"requestParam": [],
"response": {
"videos": [
{
"id": "11150808719024871032",
"title": "团团为销售老家苹果,连续熬夜40多天,今天抽时间吃顿大餐",
"poster": "https://tukuimg.bdstatic.com/processed/1fe8eaced360f31d7d5d704e8309977d.jpeg@s_2,w_454,h_256,q_100",
"poster_small": "https://tukuimg.bdstatic.com/processed/1fe8eaced360f31d7d5d704e8309977d.jpeg@s_2,w_454,h_256,q_100",
"poster_big": "https://tukuimg.bdstatic.com/processed/1fe8eaced360f31d7d5d704e8309977d.jpeg@s_2,w_681,h_381,q_100",
"poster_pc": "https://tukuimg.bdstatic.com/processed/1fe8eaced360f31d7d5d704e8309977d.jpeg@s_2,w_681,h_381,q_100,f_webp",
"source_name": "洋芋团团",
"play_url": "http://vd4.bdstatic.com/mda-kjpri2zv4gwzxhky/cae_h264_nowatermark/1603535578/mda-kjpri2zv4gwzxhky.mp4",
"playcnt": 8100,
"mthid": "1584642338507543",
"mthpic": "https://pic.rmb.bdstatic.com/1be6d7231c516392ee6ae12a6d10b720.jpeg?x-bce-process=image/resize,m_lfit,w_100,h_100",
"threadId": "1114000036445963",
"site_name": null,
"duration": "03:27",
"url": "https://haokan.baidu.com/v?pd=pc&vid=11150808719024871032",
"cmd": "baiduboxapp://v1/easybrowse/open?upgrade=1&type=video&url=https%3A%2F%2Fhaokan.baidu.com%2F%2Fv%3Fcontext%3D%257B%2522nid%2522%253A%252211150808719024871032%2522%257D%26backflow%3D1%26pd%3Dpc&style=%7B%22toolbaricons%22%3A%7B%22toolids%22%3A%5B%221%22%2C%222%22%2C%223%22%5D%7D%2C%22menumode%22%3A2%7D&newbrowser=1&slog=%257B%2522from%2522%253A%2522feed%2522%252C%2522page%2522%253A%2522sv%2522%257D",
"loc_id": "http://www.internal.video.baidu.com/6cf029cde06e261d25b6fce4f811c07d.html",
"commentInfo": {
"source": "baidumedia",
"key": "1681428601773893025"
},
"comment_id": "1681428601773893025",
"show_tag": 0,
"publish_time": "2020年10月24日",
"new_cate_v2": "三农",
"appid": "",
"path": "",
"channel_name": "",
"channel_total_number": "",
"channel_poster": "",
"like": 87,
"fmlike": "87",
"comment": "13",
"fmcomment": "13",
"fmplaycnt": "8100次播放",
"fmplaycnt_2": "8100",
"outstand_tag": ""
}
]
}
}
}
游戏
{
"errno": 0,
"error": "成功",
"data": {
"requestParam": [],
"response": {
"videos": [
{
"id": "11279734394442989839",
"title": "一个八指战神的故事(2)",
"poster": "https://tukuimg.bdstatic.com/processed/e9c03109f8bf0fd33af33b99a155584c.jpg@s_2,w_454,h_256,q_100",
"poster_small": "https://tukuimg.bdstatic.com/processed/e9c03109f8bf0fd33af33b99a155584c.jpg@s_2,w_454,h_256,q_100",
"poster_big": "https://tukuimg.bdstatic.com/processed/e9c03109f8bf0fd33af33b99a155584c.jpg@s_2,w_681,h_381,q_100",
"poster_pc": "https://tukuimg.bdstatic.com/processed/e9c03109f8bf0fd33af33b99a155584c.jpg@s_2,w_681,h_381,q_100,f_webp",
"source_name": "优秀的雪龙",
"play_url": "http://vd2.bdstatic.com/mda-kj345n8vym5y2v72/cae_h264_nowatermark/mda-kj345n8vym5y2v72.mp4",
"playcnt": 73738,
"mthid": "1660958739212018",
"mthpic": "https://pic.rmb.bdstatic.com/bjh/user/ee10bdfb53f1a32b353622cee349959d.jpeg?x-bce-process=image/resize,m_lfit,w_100,h_100",
"threadId": "1098000035949434",
"site_name": null,
"duration": "01:36",
"url": "https://haokan.baidu.com/v?pd=pc&vid=11279734394442989839",
"cmd": "baiduboxapp://v1/easybrowse/open?upgrade=1&type=video&url=https%3A%2F%2Fhaokan.baidu.com%2F%2Fv%3Fcontext%3D%257B%2522nid%2522%253A%252211279734394442989839%2522%257D%26backflow%3D1%26pd%3Dpc&style=%7B%22toolbaricons%22%3A%7B%22toolids%22%3A%5B%221%22%2C%222%22%2C%223%22%5D%7D%2C%22menumode%22%3A2%7D&newbrowser=1&slog=%257B%2522from%2522%253A%2522feed%2522%252C%2522page%2522%253A%2522sv%2522%257D",
"loc_id": "http://www.internal.video.baidu.com/dc0133f86d8529b627ae8f46b26d3c1f.html",
"commentInfo": {
"source": "baidumedia",
"key": "1679558218788024090"
},
"comment_id": "1679558218788024090",
"show_tag": 0,
"publish_time": "2020年10月04日",
"new_cate_v2": "游戏",
"appid": "",
"path": "",
"channel_name": "",
"channel_total_number": "",
"channel_poster": "",
"like": 1606,
"fmlike": "1606",
"comment": "53",
"fmcomment": "53",
"fmplaycnt": "7.4万次播放",
"fmplaycnt_2": "7.4万",
"outstand_tag": ""
}
]
}
}
}
标签
推荐 影视 音乐 VLOG 游戏 搞笑 综艺 娱乐 动漫 生活 广场舞 美食 宠物 三农 军事 更多
分析发现,这些标签对应的就是数据接口 URL 中 tab 参数的取值。
因此我们只需要爬取首页的数据,使用 XPath 解析出各个标签对应的 tab 值并存储起来,供后面拼接接口 URL 时使用。
爬取整个网站的部分数据
- 爬取首页 https://haokan.baidu.com/ 的数据,并通过 XPath 解析出对应的 tab 信息
- 根据我们的分析结果,用 tab 信息拼接出获取该 tab 下视频数据的 URL(将 TABNAME 替换为具体的 tab 值):https://haokan.baidu.com/videoui/api/videorec?tab=TABNAME&act=pcFeed&pd=pc&num=15&shuaxin_id=1608126162900
- 根据上面的 URL 一次只能抓取到一批数据,如何抓取不同批次的数据呢?观察 URL 可以发现,最后一个参数 shuaxin_id 是一个毫秒级时间戳;实际测试发现,变更这个时间戳就可以获取到另一批数据。
这一节就先到这里,下一节分析数据的存储方式,以及如何把视频下载并存储到本地。
【推荐】国内首个AI IDE,深度理解中文开发场景,立即下载体验Trae
【推荐】编程新体验,更懂你的AI,立即体验豆包MarsCode编程助手
【推荐】抖音旗下AI助手豆包,你的智能百科全书,全免费不限次数
【推荐】轻量又高性能的 SSH 工具 IShell:AI 加持,快人一步
· 如何编写易于单元测试的代码
· 10年+ .NET Coder 心语,封装的思维:从隐藏、稳定开始理解其本质意义
· .NET Core 中如何实现缓存的预热?
· 从 HTTP 原因短语缺失研究 HTTP/2 和 HTTP/3 的设计差异
· AI与.NET技术实操系列:向量存储与相似性搜索在 .NET 中的实现
· 地球OL攻略 —— 某应届生求职总结
· 周边上新:园子的第一款马克杯温暖上架
· Open-Sora 2.0 重磅开源!
· 提示词工程——AI应用必不可少的技术
· .NET周刊【3月第1期 2025-03-02】