爬虫小案例——爬取校花网
爬取校花网图片
# Scrape student photos from xiaohuar.com.
#
# Index-page routing pattern:
#   http://www.xiaohuar.com/list-1-0.html  -> page 1
#   http://www.xiaohuar.com/list-1-1.html  -> page 2
#   http://www.xiaohuar.com/list-1-2.html  -> page 3
#   ...
from requests_html import HTMLSession
import os

session = HTMLSession()


def get_page_url():
    """Yield the URL of every index page to scrape (first two pages)."""
    for i in range(2):
        yield 'http://www.xiaohuar.com/list-1-{}.html'.format(i)


def parse_page(url):
    """Fetch one index page, extract each image's name and URL, and save it.

    :param url: index-page URL produced by ``get_page_url``.
    """
    r = session.request(method='get', url=url)
    img_element_list = r.html.find('[class="img"] img')
    for img_element in img_element_list:
        # Strip path separators so the alt text becomes a safe file name.
        file_name = img_element.attrs.get('alt').replace('/', '').replace('\\', '') + '.png'
        print(file_name)
        file_url = img_element.attrs.get('src')
        # Some src attributes are site-relative; prefix the base URL in that case.
        file_url = r.html.base_url[:-1] + file_url if not file_url.startswith('http') else file_url
        save_file(file_name, file_url)


def save_file(name, url):
    """Download *url* and write it as *name* inside the output directory.

    :param name: sanitized file name (including extension).
    :param url: absolute image URL.
    """
    base_path = '校花图片'
    # BUGFIX: create the output directory first — the original opened a path
    # under a non-existent directory, which raises FileNotFoundError.
    os.makedirs(base_path, exist_ok=True)
    file_path = os.path.join(base_path, name)
    r = session.get(url=url)
    with open(file_path, 'wb') as f:
        f.write(r.content)
    print('%s下载成功' % name)


if __name__ == '__main__':
    for page_url in get_page_url():
        parse_page(page_url)
爬取校花网视频
# Scrape videos from xiaohuar.com.
#
# Index-page routing pattern:
#   http://www.xiaohuar.com/list-3-0.html  -> page 1
#   http://www.xiaohuar.com/list-3-1.html  -> page 2
#   ...
#   http://www.xiaohuar.com/list-3-5.html  -> page 6
from requests_html import HTMLSession
import os

session = HTMLSession()


def get_index_page():
    """Yield the URL of each of the six video index pages."""
    for i in range(6):
        url = 'http://www.xiaohuar.com/list-3-%s.html' % i
        yield url


def get_detail_page(url):
    """Fetch an index page and yield the URL of every detail page on it.

    :param url: index-page URL produced by ``get_index_page``.
    """
    r = session.get(url=url)
    for element in r.html.find('#images a[class="imglink"]'):
        print(element.attrs.get('href'))
        yield element.attrs.get('href')


def get_url_name(url):
    """Parse a detail page for the video's name, URL and container type.

    :param url: detail-page URL.
    :returns: ``(file_name, video_url, video_type)`` where *video_type* is
        ``'mp4'`` for a direct <source> link or ``'m3u8'`` for an HLS playlist.
    """
    r = session.get(url=url)
    r.html.encoding = 'gbk'  # pages are GBK-encoded; needed to decode the title
    file_name = r.html.find('title', first=True).text.replace('\\', '')
    print(file_name)
    element = r.html.find('#media source', first=True)
    if element:
        video_url = element.attrs.get('src')
        video_type = 'mp4'
    else:
        # No <source> tag: the playlist URL is embedded in a JS variable.
        video_url = r.html.search('var vHLSurl = "{}";')[0]
        video_type = 'm3u8'
    return file_name, video_url, video_type


def save(file_name, video_url, video_type):
    """Dispatch the download according to the video container type."""
    if video_type == 'mp4':
        file_name += '.mp4'
        r = session.get(url=video_url)
        with open(file_name, 'wb') as f:
            f.write(r.content)
    elif video_type == 'm3u8':
        save_m3u8(file_name, video_url)


def save_m3u8(file_name, video_url):
    """Download an HLS playlist plus every .ts segment into a directory.

    :param file_name: directory name to store the playlist and segments.
    :param video_url: URL of the ``playlist.m3u8`` file.
    """
    if not os.path.exists(file_name):
        os.mkdir(file_name)
    r = session.get(url=video_url)
    m3u8_path = os.path.join(file_name, 'playlist.m3u8')
    with open(m3u8_path, 'wb') as f:
        f.write(r.content)
    # BUGFIX: the original did ``for line in r.text``, which iterates the
    # response CHARACTER by character, so ``endswith('ts')`` never matched
    # and no segment was ever downloaded. Iterate real lines instead.
    for line in r.text.splitlines():
        if line.endswith('ts'):
            # Segment URLs are siblings of the playlist file.
            ts_url = video_url.replace('playlist.m3u8', line)
            ts_path = os.path.join(file_name, line)
            r1 = session.get(url=ts_url)
            with open(ts_path, 'wb') as f:
                f.write(r1.content)


if __name__ == '__main__':
    for index_page in get_index_page():
        for detail_url in get_detail_page(index_page):
            file_name, video_url, video_type = get_url_name(detail_url)
            save(file_name, video_url, video_type)