爬取豆瓣图书
未运行出想要的结果
#-*- coding:UTF-8 -*-
import importlib
import sys
import time
import urllib
import urllib.error
import urllib.parse
import urllib.request

import numpy as np
import requests
import urllib3
from bs4 import BeautifulSoup
from openpyxl import Workbook
# Pool of request headers; the scraping functions pick one entry per request
# so that successive requests do not all carry the same User-Agent.
# (The former ``importlib.reload(sys)`` call was a Python 2 leftover and has
# been dropped: reloading ``sys`` is discouraged in Python 3.)
hds = [
    {'User-Agent': 'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US; rv:1.9.1.6) Gecko/20091201 Firefox/3.5.6'},
    {'User-Agent': 'Mozilla/5.0 (Windows NT 6.2) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.12 Safari/535.11'},
    {'User-Agent': 'Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.2; Trident/6.0)'},
]
# Give up on a page after this many consecutive unsuccessful attempts.
_MAX_TRIES = 200


def _split_desc(desc):
    """Split a '/'-separated listing description into display labels.

    The last three fields are treated as publication details and everything
    before them as authors/translators.

    Returns:
        A ``(author_info, pub_info)`` tuple of labelled strings; a part that
        is missing is reported as '暂无'.
    """
    fields = desc.split('/') if desc else []
    authors = fields[:-3]
    publication = fields[-3:]

    if authors:
        author_info = '作者/译者: ' + '/'.join(authors)
    else:
        author_info = '作者/译者: 暂无'

    if publication:
        pub_info = '出版信息:' + '/'.join(publication)
    else:
        pub_info = '出版信息: 暂无'
    return author_info, pub_info


def _parse_book_entry(book_info):
    """Turn one <dd> listing entry into a book row, or None if it has no title link."""
    title_link = book_info.find('a', {'class': 'title'})
    if title_link is None:
        return None
    title = title_link.get_text(strip=True)
    book_url = title_link.get('href')

    desc_tag = book_info.find('div', {'class': 'desc'})
    desc = desc_tag.get_text(strip=True) if desc_tag is not None else ''
    author_info, pub_info = _split_desc(desc)

    rating_tag = book_info.find('span', {'class': 'rating_nums'})
    rating = rating_tag.get_text(strip=True) if rating_tag is not None else ''
    if not rating:
        rating = '0.0'

    # Best effort: the rating count comes from the book's own page; any
    # failure there must not abort the whole crawl, so fall back to '0'.
    try:
        people_num = get_people_num(book_url).strip('人评价') or '0'
    except Exception as e:
        print(e)
        people_num = '0'

    return [title, rating, people_num, author_info, pub_info]


def book_spider(book_tag):
    """Collect the books listed under one Douban tag.

    Requests the tag's listing pages one after another (advancing the
    ``start`` offset by 15 per page), sleeping a random 0-5 seconds before
    every request. Crawling stops when a page has no usable book list or
    after 200 consecutive unsuccessful attempts on the same page.

    Args:
        book_tag: Tag name, for example '个人管理'.

    Returns:
        A list of ``[title, rating, people_num, author_info, pub_info]``
        rows, every field being a string.
    """
    page_num = 0
    book_list = []
    # Consecutive attempts spent on the current page.
    try_times = 0

    while True:
        url = ('http://www.douban.com/tag/' + urllib.parse.quote(book_tag)
               + '/book?start=' + str(page_num * 15))
        print("url is {}".format(url))

        # Random pause between requests.
        time.sleep(np.random.rand() * 5)

        try_times += 1
        try:
            # Rotate through the User-Agent pool page by page.
            req = urllib.request.Request(url, headers=hds[page_num % len(hds)])
            with urllib.request.urlopen(req) as response:
                source_code = response.read()
        except (urllib.error.HTTPError, urllib.error.URLError) as e:
            print(e)
            # Failed requests count as attempts too, otherwise a persistent
            # error would make this loop spin forever.
            if try_times >= _MAX_TRIES:
                break
            continue

        # The response body is bytes; decode it instead of taking its repr.
        plain_text = source_code.decode('utf-8', errors='replace')
        soup = BeautifulSoup(plain_text, 'html.parser')
        list_soup = soup.find('div', {'class': 'mod book-list'})

        if list_soup is None and try_times < _MAX_TRIES:
            continue
        if list_soup is None or len(list_soup) <= 1:
            # No information after the maximum number of attempts, or an
            # empty listing: nothing more to collect for this tag.
            break

        for book_info in list_soup.find_all('dd'):
            row = _parse_book_entry(book_info)
            if row is not None:
                book_list.append(row)

        # Valid information was obtained: reset the counter and move on.
        try_times = 0
        page_num += 1
        print("Downloading Information From Page {}".format(page_num))
        print("现在的长度是", len(book_list))

    return book_list
def get_people_num(url):
    """Fetch a book's detail page and return its rating-count text.

    Args:
        url: URL of the book's detail page, for example
            'http://book.douban.com/subject/6082808/?from=tag_all'.

    Returns:
        The stripped text of the second <span> inside the page's
        'rating_sum' div, or '0' when the page cannot be fetched or does not
        contain that element.
    """
    if not url:
        return '0'

    try:
        # Pick a random User-Agent from the pool for each detail request.
        req = urllib.request.Request(url, headers=hds[np.random.randint(0, len(hds))])
        with urllib.request.urlopen(req) as response:
            source_code = response.read()
    except (urllib.error.HTTPError, urllib.error.URLError) as e:
        print(e)
        # Without a page there is nothing to parse.
        return '0'

    # The response body is bytes; decode it instead of taking its repr.
    plain_text = source_code.decode('utf-8', errors='replace')
    soup = BeautifulSoup(plain_text, 'html.parser')

    rating_sum = soup.find('div', {'class': 'rating_sum'})
    if rating_sum is None:
        return '0'
    spans = rating_sum.find_all('span')
    if len(spans) < 2 or spans[1].string is None:
        return '0'
    return spans[1].string.strip()
def _rating_value(book):
    """Return a book row's rating (second field) as a float, 0.0 if unparsable."""
    try:
        return float(book[1])
    except (TypeError, ValueError):
        return 0.0


def do_spider(book_tag_lists):
    """Scrape every tag and return the per-tag book lists.

    Args:
        book_tag_lists: Tag names, for example
            ['个人管理', '时间管理', '投资', '文化', '宗教'].

    Returns:
        A list with one entry per tag, in the same order as
        ``book_tag_lists``; each entry is that tag's list of book rows sorted
        by rating, highest first.
    """
    book_lists = []
    for book_tag in book_tag_lists:
        book_list = book_spider(book_tag)
        # Compare ratings numerically; as strings '10.0' would sort below '9.0'.
        book_lists.append(sorted(book_list, key=_rating_value, reverse=True))
    return book_lists
def _to_float(text, default=0.0):
    """Convert scraped text to float, returning *default* when it is not numeric."""
    try:
        return float(text)
    except (TypeError, ValueError):
        return default


def _to_int(text, default=0):
    """Convert scraped text to int, returning *default* when it is not numeric."""
    try:
        return int(text)
    except (TypeError, ValueError):
        return default


def print_book_lists_excel(book_lists, book_tag_lists):
    """Write the scraped books to an Excel workbook, one worksheet per tag.

    Each worksheet starts with a single header row followed by one numbered
    row per book. The workbook is saved in the current working directory as
    'book_list-<tag1>-<tag2>-....xlsx'.

    Args:
        book_lists: One list of ``[title, rating, people_num, author_info,
            pub_info]`` rows per tag, in the same order as ``book_tag_lists``.
        book_tag_lists: Tag names; used as worksheet titles and in the file
            name.
    """
    wb = Workbook()

    for index, book_tag in enumerate(book_tag_lists):
        if index == 0:
            # Reuse the sheet a new workbook already contains so that no
            # empty default sheet is left in the saved file.
            ws = wb.active
            ws.title = book_tag
        else:
            ws = wb.create_sheet(title=book_tag)

        ws.append(['序号', '书名', '评分', '评价人数', '作者', '出版社'])

        # A tag without a matching book list simply gets a header-only sheet.
        books = book_lists[index] if index < len(book_lists) else []
        for count, book in enumerate(books, start=1):
            ws.append([count, book[0], _to_float(book[1]), _to_int(book[2]),
                       book[3], book[4]])

    save_path = 'book_list' + ''.join('-' + tag for tag in book_tag_lists) + '.xlsx'
    print("这是我的名字:" + save_path)
    wb.save(save_path)
def main():
    """Scrape the configured Douban tags and export the results to Excel."""
    # Alternative tag sets; swap one in to scrape a different selection.
    # book_tag_lists = ['心理','判断与决策','算法','数据结构','经济','历史']
    # book_tag_lists = ['传记','哲学','编程','创业','理财','社会学','佛教']
    # book_tag_lists = ['思想','科技','科学','web','股票','爱情','两性']
    # book_tag_lists = ['计算机','机器学习','linux','android','数据库','互联网']
    # book_tag_lists = ['数学']
    # book_tag_lists = ['摄影','设计','音乐','旅行','教育','成长','情感','育儿','健康','养生']
    # book_tag_lists = ['商业','理财','管理']
    # book_tag_lists = ['名著']
    # book_tag_lists = ['科普','经典','生活','心灵','文学']
    # book_tag_lists = ['科幻','思维','金融']
    # book_tag_lists = ['个人管理', '时间管理', '投资', '文化', '宗教']
    book_tag_lists = ['文化']
    book_lists = do_spider(book_tag_lists)
    print_book_lists_excel(book_lists, book_tag_lists)


# Run the scraper only when executed as a script, not when imported.
if __name__ == '__main__':
    main()
【推荐】编程新体验,更懂你的AI,立即体验豆包MarsCode编程助手
【推荐】凌霞软件回馈社区,博客园 & 1Panel & Halo 联合会员上线
【推荐】抖音旗下AI助手豆包,你的智能百科全书,全免费不限次数
【推荐】博客园社区专享云产品让利特惠,阿里云新客6.5折上折
【推荐】轻量又高性能的 SSH 工具 IShell:AI 加持,快人一步
· DeepSeek 解答了困扰我五年的技术问题
· 为什么说在企业级应用开发中,后端往往是效率杀手?
· 用 C# 插值字符串处理器写一个 sscanf
· Java 中堆内存和栈内存上的数据分布和特点
· 开发中对象命名的一点思考
· DeepSeek 解答了困扰我五年的技术问题。时代确实变了!
· PPT革命!DeepSeek+Kimi=N小时工作5分钟完成?
· What?废柴, 还在本地部署DeepSeek吗?Are you kidding?
· DeepSeek企业级部署实战指南:从服务器选型到Dify私有化落地
· 程序员转型AI:行业分析
2018-07-04 Java 代码打印List中的类到Excel表中