Crawler - Scraping 游民星空 (Gamersky) News with Dynamic Pagination - bs4
#!/usr/bin/env python
# coding=utf-8
'''
author: dangxusheng
desc  : scrape 游民星空 (Gamersky) news with dynamic pagination
date  : 2018-08-29
'''
import requests
from bs4 import BeautifulSoup
import json
import time

url = "https://www.gamersky.com/news/"
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.63 Safari/537.36 Qiyu/2.1.1.1",
    "Referer": "https://www.gamersky.com/news/"
}


def find_text(parent, tag, cls, default=None):
    # find() returns None when the element is missing, so guard it
    # before reading .string (which may itself be None)
    node = parent.find(tag, attrs={'class': cls})
    return node.string if node is not None and node.string is not None else default


# Fetch the news items of a single page
def once_page_info(page_index=1):
    # millisecond timestamp, used as the cache-busting `_` parameter
    timestamp = str(time.time()).replace('.', '')[0:13]
    # paginated JSONP endpoint; jsondata is a URL-encoded JSON object
    url = "https://db2.gamersky.com/LabelJsonpAjax.aspx?callback=jQuery18308266280560965529_1541308409652&jsondata=%7B%22type%22%3A%22updatenodelabel%22%2C%22isCache%22%3Atrue%2C%22cacheTime%22%3A60%2C%22nodeId%22%3A%2211007%22%2C%22isNodeId%22%3A%22true%22%2C%22page%22%3A" + str(page_index) + "%7D&_=" + timestamp
    r = requests.get(url, headers=headers)
    # The response is JSON wrapped in a JSONP callback; strip the leading
    # "jQuery...(" and the trailing ");" before parsing
    now_page_html = json.loads(r.text[41:-2])['body']
    soup = BeautifulSoup(now_page_html, 'html.parser')
    # the items are <li> elements inside <ul class="pictxt contentpaging">
    li_list = soup.find_all('li')
    ls = []
    for once_li in li_list:
        once_type = find_text(once_li, 'a', 'dh', 'no type')
        once_title = find_text(once_li, 'a', 'tt', 'no title')
        once_info = find_text(once_li, 'div', 'txt', 'no summary')
        once_time = find_text(once_li, 'div', 'time')
        once_visited = find_text(once_li, 'div', 'visit gshit')
        once_comment = find_text(once_li, 'div', 'pls cy_comment')
        once_img = once_li.find('img', attrs={'class': 'pe_u_thumb'})
        once_img_url = once_img.attrs['src'] if once_img is not None else None
        ls.append({'type': once_type, 'title': once_title, 'info': once_info,
                   'time': once_time, 'visited': once_visited,
                   'comment': once_comment, 'img_url': once_img_url})
    return ls


# Append one page of results to a text file, one "::"-separated record per line
def save_to_file(all_info):
    with open('./gamersky.txt', 'a', encoding='utf-8') as file:
        for o in all_info:
            file.write("%s::%s::%s::%s::%s::%s::%s\n" % (
                o['type'], o['title'], o['time'], o['visited'],
                o['comment'], o['img_url'], o['info']))


if __name__ == '__main__':
    for i in range(1, 10):
        page_info = once_page_info(i)
        save_to_file(page_info)
        print('Page %d downloaded' % i)
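
The long URL-encoded jsondata blob above simply decodes to a small JSON object: {"type":"updatenodelabel","isCache":true,"cacheTime":60,"nodeId":"11007","isNodeId":"true","page":N}. Below is a minimal sketch of building the same request URL programmatically instead of hand-encoding it; build_page_url is a hypothetical helper (its field names come from decoding the query string above, and the callback value is simply echoed back by the server):

import json
import time
from urllib.parse import quote

def build_page_url(page_index, callback="jQuery18308266280560965529_1541308409652"):
    # same payload as the hand-encoded blob; compact separators keep the
    # encoded form matching what the hard-coded URL above sends
    jsondata = json.dumps({
        "type": "updatenodelabel",
        "isCache": True,
        "cacheTime": 60,
        "nodeId": "11007",   # node id of the news channel
        "isNodeId": "true",
        "page": page_index,
    }, separators=(',', ':'))
    timestamp = str(time.time()).replace('.', '')[:13]
    return ("https://db2.gamersky.com/LabelJsonpAjax.aspx"
            "?callback=%s&jsondata=%s&_=%s" % (callback, quote(jsondata), timestamp))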
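
The r.text[41:-2] slice assumes the callback name never changes length. A slightly more defensive sketch (assuming the body is always a single callback(...); wrapper) extracts whatever sits between the first '(' and the last ')':

import json

def strip_jsonp(text):
    # JSONP responses look like callback({...}); pull the JSON payload
    # out from between the first '(' and the last ')' instead of
    # relying on hard-coded offsets
    start = text.index('(') + 1
    end = text.rindex(')')
    return json.loads(text[start:end])

# usage inside once_page_info:
#   now_page_html = strip_jsonp(r.text)['body']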