【Crawler】Qiushibaike Crawler
--------------------------------------------------------------------------------------------------------
Writing this crawler took me a really, really long time, because every time I started crawling I ended up reading the jokes on Qiushibaike instead...
--------------------------------------------------------------------------------------------------------
Environment: Python 3.6
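Besides the standard library (csv, json, random), the script depends on the third-party packages requests, beautifulsoup4, and lxml (the parser passed to BeautifulSoup), so install them first, e.g. with pip install requests beautifulsoup4 lxml.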
import csv
import json
import random

import requests
from bs4 import BeautifulSoup


class Qiushibaike(object):
    def __init__(self):
        # Container for the scraped records
        self.agent_info_list = []
        # Prefix used to build each joke's detail-page URL
        self.home_url = 'https://www.qiushibaike.com/article/'
        # Entry URL
        self.base_url = 'https://www.qiushibaike.com/8hr/page/1/'
        # Pool of User-Agent strings; one is drawn at random for every request
        self.user_agents = [
            'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.99 Safari/537.36',
            'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.99 Safari/537.36',
            'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.99 Safari/537.36',
            'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.98 Safari/537.36',
            'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_1) AppleWebKit/602.2.14 (KHTML, like Gecko) Version/10.0.1 Safari/602.2.14',
            'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.98 Safari/537.36',
            'Mozilla/5.0 (Windows NT 10.0; WOW64; rv:50.0) Gecko/20100101 Firefox/50.0',
            'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.99 Safari/537.36',
            'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:50.0) Gecko/20100101 Firefox/50.0',
            'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.71 Safari/537.36']

    def make_headers(self):
        # Request headers, with a random User-Agent drawn for this request
        return {
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
            'Host': 'www.qiushibaike.com',
            'Proxy-Connection': 'keep-alive',
            'Referer': 'https://www.baidu.com',
            'User-Agent': random.choice(self.user_agents)}

    def start_requests(self, base_url):
        # Fetch the entry URL and return its raw content
        return requests.get(base_url, headers=self.make_headers()).content

    def analysis_data(self, start_html, page):
        start_soup = BeautifulSoup(start_html, 'lxml')
        # The maximum page number sits in the second-to-last <li> of the
        # <ul class="pagination"> element (13 at the time of writing)
        max_page = int(start_soup.find('ul', class_='pagination').find_all('li')[-2].text.strip())
        # Pagination: crawl listing pages until max_page is reached, then stop
        while page <= max_page:
            start_url = 'https://www.qiushibaike.com/8hr/page/%d/' % page
            qiushi_html = requests.get(start_url, headers=self.make_headers()).content
            qiushi_soup = BeautifulSoup(qiushi_html, 'lxml')
            details_list = qiushi_soup.select('#content-left .article')
            # Visit every joke's detail page, because long jokes are truncated
            # on the listing page
            for details in details_list:
                details_url = self.home_url + details['id'].replace('qiushi_tag_', '')
                details_html = requests.get(details_url, headers=self.make_headers()).content
                details_soup = BeautifulSoup(details_html, 'lxml')
                # Dictionary holding one joke's details
                agent_info_dict = {}
                agent_info_dict['username'] = details_soup.find('div', class_='author clearfix').find('h2').text.strip()
                gender_div = details_soup.find('div', class_='articleGender')
                # Some posters are anonymous; default their age to unknown and
                # their gender to private
                if gender_div:
                    agent_info_dict['age'] = gender_div.text.strip()
                    if gender_div['class'][1].replace('Icon', '') == 'man':
                        agent_info_dict['gender'] = 'male'
                    else:
                        agent_info_dict['gender'] = 'female'
                else:
                    agent_info_dict['gender'] = 'private'
                    agent_info_dict['age'] = 'unknown'
                agent_info_dict['content'] = details_soup.find('div', class_='content').text.strip()
                agent_info_dict['votes'] = details_soup.find('span', class_='stats-vote').find('i').text.strip()
                # Append this joke's dictionary to the result list
                self.agent_info_list.append(agent_info_dict)
            print('Page %d saved' % page)
            page += 1

    def keep_file(self):
        # Persist the records as JSON; ensure_ascii=False keeps any non-ASCII
        # text human-readable in the file
        with open('qiushibaike.json', 'w', encoding='utf-8') as json_file:
            json.dump(self.agent_info_list, json_file, ensure_ascii=False)

    def json_to_csv(self):
        # 1. Read the JSON file back
        with open('qiushibaike.json', 'r', encoding='utf-8') as json_file:
            data_list = json.load(json_file)
        # 2. Create the CSV file (newline='' avoids blank rows on Windows)
        with open('qiushibaike.csv', 'w', encoding='utf-8', newline='') as csv_file:
            # 3. Create the writer
            csv_writer = csv.writer(csv_file)
            # 4. Header row: the keys of the first record
            sheet_title = data_list[0].keys()
            # 5. Content rows: each record's values
            content_list = [dict_data.values() for dict_data in data_list]
            # 6. Write the header
            csv_writer.writerow(sheet_title)
            # 7. Write the content
            csv_writer.writerows(content_list)

    def run(self):
        start_html = self.start_requests(self.base_url)
        self.analysis_data(start_html, 1)
        self.keep_file()
        self.json_to_csv()


if __name__ == '__main__':
    spider = Qiushibaike()
    spider.run()
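One caveat about json_to_csv(): it builds the header from the first record's keys and writes each record's values() positionally, which assumes every record stores its keys in the same order. A slightly more robust variant is csv.DictWriter, which matches values to columns by key; this is a minimal sketch, not part of the original script, using the same file names the script writes:

import csv
import json

# Sketch: rewrite the JSON records as CSV with DictWriter, which maps each
# value to its column by key instead of relying on dict key order
with open('qiushibaike.json', 'r', encoding='utf-8') as json_file:
    data_list = json.load(json_file)

with open('qiushibaike.csv', 'w', encoding='utf-8', newline='') as csv_file:
    # Column names come from the first record, as in the original method
    writer = csv.DictWriter(csv_file, fieldnames=list(data_list[0].keys()))
    writer.writeheader()
    writer.writerows(data_list)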
Run the script directly and it will crawl all 13 pages of Qiushibaike jokes (the site's maximum at the time of writing) and save them to a CSV file.
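To sanity-check the output, the CSV can be read back with the standard library; a quick check, assuming the crawl completed and qiushibaike.csv exists:

import csv

with open('qiushibaike.csv', 'r', encoding='utf-8', newline='') as csv_file:
    rows = list(csv.reader(csv_file))

print('Header:', rows[0])
print('Number of records:', len(rows) - 1)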