从零开始--02-数据下载
数据抓取
上一节中,我们分析了网站的URL,发现可以抓取视频数据以及热词数据(搜索框提示数据)。
URL分析
分析一下视频数据的url
url = 'https://haokan.baidu.com/videoui/api/videorec?tab=yingshi&act=pcFeed&pd=pc&num=20&shuaxin_id=1608125768624'
其中的tab是我们在首页看到的视频分类标签的拼音(例如yingshi对应"影视");后面的shuaxin_id看起来是一个毫秒级时间戳,用来充当随机数;num表示一次获取几条数据。
JSON数据格式分析
上一节中,我们从URL获取到的数据里有几个公共的字段:
{
"errno": 0,
"error": "成功",
"data": {
"requestParam": [],
"response": {
"videos": [
{
"id": "5935263900090481104",
"title": "霸总怎么都想不到,他随手救下的小孩,居然是他的亲儿子!",
"poster": "https://tukuimg.bdstatic.com/processed/8e8086eb7cf3f54c90da2d590f652484.jpg@s_2,w_454,h_256,q_100",
"poster_small": "https://tukuimg.bdstatic.com/processed/8e8086eb7cf3f54c90da2d590f652484.jpg@s_2,w_454,h_256,q_100",
"poster_big": "https://tukuimg.bdstatic.com/processed/8e8086eb7cf3f54c90da2d590f652484.jpg@s_2,w_681,h_381,q_100",
"poster_pc": "https://tukuimg.bdstatic.com/processed/8e8086eb7cf3f54c90da2d590f652484.jpg@s_2,w_681,h_381,q_100,f_webp",
"source_name": "好剧渲染",
"play_url": "http://vd3.bdstatic.com/mda-kj6edbgpk3cs0qz4/cae_h264_nowatermark/1606875218/mda-kj6edbgpk3cs0qz4.mp4",
"playcnt": 549866,
"mthid": "1634935029156178",
"mthpic": "https://pic.rmb.bdstatic.com/bjh/user/94863b3c176d3223a379e0e206876aa0.jpeg?x-bce-process=image/resize,m_lfit,w_100,h_100",
"threadId": "1059000036007127",
"site_name": null,
"duration": "10:00",
"url": "https://haokan.baidu.com/v?pd=pc&vid=5935263900090481104",
"cmd": "baiduboxapp://v1/easybrowse/open?upgrade=1&type=video&url=https%3A%2F%2Fhaokan.baidu.com%2F%2Fv%3Fcontext%3D%257B%2522nid%2522%253A%25225935263900090481104%2522%257D%26backflow%3D1%26pd%3Dpc&style=%7B%22toolbaricons%22%3A%7B%22toolids%22%3A%5B%221%22%2C%222%22%2C%223%22%5D%7D%2C%22menumode%22%3A2%7D&newbrowser=1&slog=%257B%2522from%2522%253A%2522feed%2522%252C%2522page%2522%253A%2522sv%2522%257D",
"loc_id": "http://www.internal.video.baidu.com/5149be5226f83954df8b41ac83a9b546.html",
"commentInfo": {
"source": "baidumedia",
"key": "1679857347109984154"
},
"comment_id": "1679857347109984154",
"show_tag": 0,
"publish_time": "2020年10月07日",
"new_cate_v2": "影视",
"appid": "",
"path": "",
"channel_name": "",
"channel_total_number": "",
"channel_poster": "",
"like": 7628,
"fmlike": "7628",
"comment": "0",
"fmcomment": "0次播放",
"fmplaycnt": "55万次播放",
"fmplaycnt_2": "55万",
"outstand_tag": ""
}
]
}
}
}
其中errno表示错误码(0表示没有出错),error表示错误信息;data是一个JSON对象,里面存储了请求参数(requestParam)和响应数据(response)。我们要用的数据都在data对象的response中,所以下面先对响应数据做一层基础封装。
响应数据的封装
基础响应数据
class BaseData:
    """Wrapper around the ``data`` object of an API response.

    Attributes:
        requestParam: Value stored under the ``requestParam`` key, or ``None``
            when the key (or the whole payload) is absent.
        response: Value stored under the ``response`` key, or ``None`` when
            the key (or the whole payload) is absent.
    """

    def __init__(self, data):
        # A failed request may carry no usable payload (``None`` or a
        # non-mapping placeholder); fall back to an empty mapping so that
        # constructing the bean never hides the real error from the caller.
        if not hasattr(data, "get"):
            data = {}
        self.requestParam = data.get("requestParam")
        self.response = data.get("response")

    def __repr__(self):
        return "<Data>[%s,%s]" % (self.requestParam, self.response)
class BaseResponse:
    """Common envelope shared by every API response.

    Attributes:
        errno: Error code taken from the response; ``0`` is treated as
            success by the callers in this file.
        error: Error message taken from the response.
        data: Payload of the response.
    """

    def __init__(self, errno, error, data):
        self.errno = errno
        self.error = error
        self.data = data

    def __repr__(self):
        # ``%s`` rather than ``%d``: errno is ``None`` when the server omits
        # it, and ``%d`` would raise TypeError while reporting that failure.
        return "<Base>[%s,%s,%s]" % (self.errno, self.error, self.data)
热词响应数据
class Hotword:
    """A hot search word.

    Attributes:
        title: Text of the hot word, stored as given.
        hotNum: Hotness value of the word, stored as given.
    """

    def __init__(self, title, hotNum):
        self.title, self.hotNum = title, hotNum

    def __repr__(self):
        return "<Hotword>[{},{}]".format(self.title, self.hotNum)
视频响应数据
class CommentInfo:
    """Comment information attached to a video.

    Built from a mapping that must contain the ``source`` and ``key``
    entries; a missing entry raises ``KeyError``.
    """

    def __init__(self, data):
        # Copy the two required entries onto the instance, in order.
        for field in ("source", "key"):
            setattr(self, field, data[field])

    def __repr__(self):
        return "<CommentInfo>[{},{}]".format(self.source, self.key)
class VideoBean:
    """One video entry from the ``videos`` list of a feed response.

    Every field read with ``data[...]`` is required and raises ``KeyError``
    when absent. ``like`` and ``path`` also appear in the sample payload but
    were not captured before; they are read leniently (``None`` when absent)
    so that payloads accepted previously are still accepted.
    """

    def __init__(self, data):
        self.id = data["id"]
        self.title = data["title"]
        self.poster = data["poster"]
        self.poster_small = data["poster_small"]
        self.poster_big = data["poster_big"]
        self.source_name = data["source_name"]
        self.poster_pc = data["poster_pc"]
        self.play_url = data["play_url"]
        self.mthid = data["mthid"]
        self.playcnt = data["playcnt"]
        self.mthpic = data["mthpic"]
        self.threadId = data["threadId"]
        self.site_name = data["site_name"]
        self.duration = data["duration"]
        self.url = data["url"]
        self.cmd = data["cmd"]
        self.loc_id = data["loc_id"]
        self.comment_id = data["comment_id"]
        self.show_tag = data["show_tag"]
        self.publish_time = data["publish_time"]
        self.new_cate_v2 = data["new_cate_v2"]
        self.appid = data["appid"]
        self.channel_name = data["channel_name"]
        self.channel_total_number = data["channel_total_number"]
        self.channel_poster = data["channel_poster"]
        self.fmlike = data["fmlike"]
        self.comment = data["comment"]
        self.fmcomment = data["fmcomment"]
        self.fmplaycnt = data["fmplaycnt"]
        self.fmplaycnt_2 = data["fmplaycnt_2"]
        self.outstand_tag = data["outstand_tag"]
        self.commentInfo = CommentInfo(data["commentInfo"])
        # Newly captured fields; optional so older callers keep working.
        self.like = data.get("like")
        self.path = data.get("path")

    def __repr__(self):
        # Same "<Tag>[...]" layout as the other beans, so that printing a
        # list of videos shows something readable instead of object addresses.
        return "<VideoBean>[%s,%s]" % (self.id, self.title)
获取数据
import requests
import time
from bean.Bean import BaseResponse
from bean.Bean import Hotword
from bean.Bean import VideoBean
from bean.Bean import BaseData
def do_net(url, headers=None, timeout=10):
    """
    Fetch ``url`` with an HTTP GET, decode the JSON body and wrap it with
    ``parse_bean``.

    :param url: address to request
    :param headers: optional request headers; when falsy, a desktop Chrome
        User-Agent header is sent instead
    :param timeout: seconds to wait for the server, passed straight to
        ``requests.get``; without it a stalled connection blocks forever
    :return: the object produced by ``parse_bean`` for the decoded body
    """
    if not headers:
        headers = {
            "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.88 Safari/537.36"}
    resp = requests.get(url=url, headers=headers, timeout=timeout).json()
    return parse_bean(resp)
def parse_bean(data):
    """
    Convert a decoded JSON response into a ``BaseResponse``.

    :param data: mapping decoded from the response body
    :return: ``BaseResponse`` whose ``data`` attribute is a ``BaseData``;
        ``errno`` defaults to ``-1`` and ``error`` to ``""`` when the
        response omits them
    """
    payload = data.get("data")
    # Error responses may omit "data" or send a non-object placeholder;
    # normalise to an empty mapping so the caller can still inspect errno.
    if not hasattr(payload, "get"):
        payload = {}
    return BaseResponse(
        data.get("errno", -1),
        data.get("error", ""),
        BaseData(payload),
    )
def get_hot_words():
    """
    Download the hot-word list and print it.

    :return: list of ``Hotword`` objects; an empty list when the request
        reports a non-zero ``errno`` or the response carries no hot words
    """
    url = 'https://haokan.baidu.com/videoui/api/hotwords?sfrom=pc'
    resp_bean = do_net(url=url)
    if resp_bean.errno != 0:
        print("获取数据失败!数据为:", resp_bean)
        return []
    # Either level may be missing in an unexpected response; treat as empty.
    response = resp_bean.data.response or {}
    hot_words = [
        Hotword(item["title"], item["hot_num"])
        for item in response.get("hotwords") or []
    ]
    print("获取热词成功", hot_words)
    return hot_words
def get_video_data(tab="yinyue", num=5):
    """
    Download one page of the video feed and print the parsed entries.

    :param tab: value of the ``tab`` query parameter (video category)
    :param num: value of the ``num`` query parameter (entries per request)
    :return: list of ``VideoBean`` objects; an empty list when the request
        reports a non-zero ``errno`` or the response carries no videos
    """
    # str.format only fills "{}" fields; the previous "%d" placeholder was
    # sent to the server verbatim instead of the millisecond timestamp.
    url = (
        "https://haokan.baidu.com/videoui/api/videorec"
        "?tab={}&act=pcFeed&pd=pc&num={}&shuaxin_id={}"
    ).format(tab, num, int(time.time() * 1000))
    base = do_net(url)
    if base.errno != 0:
        print("获取数据失败!数据为:", base)
        return []
    # Either level may be missing in an unexpected response; treat as empty.
    response = base.data.response or {}
    videos = [VideoBean(item) for item in response.get("videos") or []]
    print(len(videos))
    print(videos)
    return videos
if __name__ == "__main__":
    # Script entry point: fetch the hot words first, then the video feed.
    for fetch in (get_hot_words, get_video_data):
        fetch()
总结
由于从首页抓取视频分类失败,后面研究后再补上。
下一节,我们将把抓取到的数据写入数据库中。
【推荐】国内首个AI IDE,深度理解中文开发场景,立即下载体验Trae
【推荐】编程新体验,更懂你的AI,立即体验豆包MarsCode编程助手
【推荐】抖音旗下AI助手豆包,你的智能百科全书,全免费不限次数
【推荐】轻量又高性能的 SSH 工具 IShell:AI 加持,快人一步
· 如何编写易于单元测试的代码
· 10年+ .NET Coder 心语,封装的思维:从隐藏、稳定开始理解其本质意义
· .NET Core 中如何实现缓存的预热?
· 从 HTTP 原因短语缺失研究 HTTP/2 和 HTTP/3 的设计差异
· AI与.NET技术实操系列:向量存储与相似性搜索在 .NET 中的实现
· 地球OL攻略 —— 某应届生求职总结
· 周边上新:园子的第一款马克杯温暖上架
· Open-Sora 2.0 重磅开源!
· 提示词工程——AI应用必不可少的技术
· .NET周刊【3月第1期 2025-03-02】