爬虫小案例——爬取校花网
爬取校花网图片
# Page routing pattern:
#   http://www.xiaohuar.com/list-1-0.html  -> page 1
#   http://www.xiaohuar.com/list-1-1.html  -> page 2
#   http://www.xiaohuar.com/list-1-2.html  -> page 3
#   http://www.xiaohuar.com/list-1-3.html  -> page 4

from requests_html import HTMLSession
import os

session = HTMLSession()


def get_page_url():
    """Yield the URL of every index page to crawl (first two pages)."""
    for i in range(2):
        yield 'http://www.xiaohuar.com/list-1-{}.html'.format(i)


def parse_page(url):
    """Parse one index page: extract each image's display name and URL, then save it.

    The image name comes from the <img> ``alt`` attribute; path separators are
    stripped so it is safe to use as a filename.
    """
    r = session.request(method='get', url=url)
    img_element_list = r.html.find('[class="img"] img')
    for img_element in img_element_list:
        file_name = img_element.attrs.get('alt').replace('/', '').replace('\\', '') + '.png'
        print(file_name)
        file_url = img_element.attrs.get('src')
        # Handle both relative and absolute image paths.
        file_url = r.html.base_url[:-1] + file_url if not file_url.startswith('http') else file_url
        save_file(file_name, file_url)


def save_file(name, url):
    """Download *url* and write its bytes into the target directory as *name*."""
    base_path = '校花图片'
    # BUG FIX: the original never created the target directory, so the very
    # first open() raised FileNotFoundError. makedirs is idempotent.
    os.makedirs(base_path, exist_ok=True)
    file_path = os.path.join(base_path, name)
    r = session.get(url=url)
    with open(file_path, 'wb') as f:
        f.write(r.content)
    print('%s下载成功' % name)


if __name__ == '__main__':
    for page_url in get_page_url():
        parse_page(page_url)
爬取校花网视频
# Page routing pattern:
#   http://www.xiaohuar.com/list-3-0.html  -> page 1
#   http://www.xiaohuar.com/list-3-1.html  -> page 2
#   ...
#   http://www.xiaohuar.com/list-3-5.html  -> page 6

from requests_html import HTMLSession
import os

session = HTMLSession()


def get_index_page():
    """Yield the URL of each of the six video index pages."""
    for i in range(6):
        url = 'http://www.xiaohuar.com/list-3-%s.html' % i
        yield url


def get_detail_page(url):
    """Parse an index page and yield every detail-page URL found on it."""
    r = session.get(url=url)
    for element in r.html.find('#images a[class="imglink"]'):
        print(element.attrs.get('href'))
        yield element.attrs.get('href')


def get_url_name(url):
    """Parse a detail page and return ``(file_name, video_url, video_type)``.

    ``video_type`` is ``'mp4'`` when the page embeds a ``<source>`` element
    directly; otherwise ``'m3u8'`` and the playlist URL is scraped out of an
    inline JavaScript variable (``vHLSurl``).
    """
    r = session.get(url=url)
    r.html.encoding = 'gbk'  # the page is GBK-encoded; needed for a readable title
    file_name = r.html.find('title', first=True).text.replace('\\', '')
    print(file_name)
    element = r.html.find('#media source', first=True)
    if element:
        video_url = element.attrs.get('src')
        video_type = 'mp4'
    else:
        # No direct <source>: the HLS playlist URL lives in a JS assignment.
        video_url = r.html.search('var vHLSurl = "{}";')[0]
        video_type = 'm3u8'
    return file_name, video_url, video_type


def save(file_name, video_url, video_type):
    """Save one video: direct download for mp4, segment download for m3u8."""
    if video_type == 'mp4':
        file_name += '.mp4'
        r = session.get(url=video_url)
        with open(file_name, 'wb') as f:
            f.write(r.content)
    elif video_type == 'm3u8':
        save_m3u8(file_name, video_url)


# Handle an m3u8 (HLS) video
def save_m3u8(file_name, video_url):
    """Download the HLS playlist plus all of its .ts segments into a directory
    named *file_name*.
    """
    # makedirs with exist_ok replaces the exists()-then-mkdir race.
    os.makedirs(file_name, exist_ok=True)
    r = session.get(url=video_url)
    m3u8_path = os.path.join(file_name, 'playlist.m3u8')
    with open(m3u8_path, 'wb') as f:
        f.write(r.content)
    # BUG FIX: the original iterated `r.text` character by character, so no
    # "line" ever ended with 'ts' and no segment was ever downloaded.
    # splitlines() yields actual playlist lines.
    for line in r.text.splitlines():
        if line.endswith('ts'):
            # Segment URLs are siblings of the playlist file.
            ts_url = video_url.replace('playlist.m3u8', line)
            ts_path = os.path.join(file_name, line)
            r1 = session.get(url=ts_url)
            with open(ts_path, 'wb') as f:
                f.write(r1.content)


if __name__ == '__main__':
    for index_page in get_index_page():
        for detail_url in get_detail_page(index_page):
            file_name, video_url, video_type = get_url_name(detail_url)
            save(file_name, video_url, video_type)
【推荐】国内首个AI IDE,深度理解中文开发场景,立即下载体验Trae
【推荐】编程新体验,更懂你的AI,立即体验豆包MarsCode编程助手
【推荐】抖音旗下AI助手豆包,你的智能百科全书,全免费不限次数
【推荐】轻量又高性能的 SSH 工具 IShell:AI 加持,快人一步
· go语言实现终端里的倒计时
· 如何编写易于单元测试的代码
· 10年+ .NET Coder 心语,封装的思维:从隐藏、稳定开始理解其本质意义
· .NET Core 中如何实现缓存的预热?
· 从 HTTP 原因短语缺失研究 HTTP/2 和 HTTP/3 的设计差异
· 分享一个免费、快速、无限量使用的满血 DeepSeek R1 模型,支持深度思考和联网搜索!
· 使用C#创建一个MCP客户端
· 基于 Docker 搭建 FRP 内网穿透开源项目(很简单哒)
· ollama系列1:轻松3步本地部署deepseek,普通电脑可用
· 按钮权限的设计及实现