Python Scraping Exercise: Scraping a Novel from 笔趣阁 (Part 1)
An exercise in using requests and BeautifulSoup to scrape a novel and save it, chapter by chapter, to the D: drive.
The crawl is fairly slow, and requests to the server are easily interrupted.
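One way to mitigate those interruptions is to reuse a single connection with automatic retries and timeouts. A minimal sketch, assuming the same http://www.biqukan.com target; the retry count, backoff factor, and timeout below are illustrative assumptions, not tuned values:

import requests
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry

def make_session():
    # One Session reuses the underlying TCP connection across requests
    session = requests.Session()
    retry = Retry(
        total=3,                # up to 3 retries per request (assumed)
        backoff_factor=1,       # exponential backoff between attempts
        status_forcelist=[500, 502, 503, 504],  # retry on these server errors
    )
    session.mount("http://", HTTPAdapter(max_retries=retry))
    session.mount("https://", HTTPAdapter(max_retries=retry))
    return session

# Usage: replace each bare requests.get(url=url) call below with
# session.get(url, timeout=10) so a hung connection fails fast
# instead of stalling the whole crawl.

The full script for the exercise: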
# -*- coding:UTF-8 -*-
import requests
from bs4 import BeautifulSoup

"""
Fetch the book's table of contents
"""
def getBookContents(url):
    req = requests.get(url=url)
    # The site serves GBK-encoded pages (GBK is a superset of gb2312)
    req.encoding = "gbk"
    html = req.text
    dv_bf = BeautifulSoup(html, "html5lib")
    dv = dv_bf.find("div", class_="listmain")
    a_bf = BeautifulSoup(str(dv), "html5lib")
    a = a_bf.find_all("a")
    book_contents_list = []
    i = 0
    # The first 13 links are the "latest chapters" block at the top of
    # the listing, not the table of contents proper, so skip them
    for content in a[13:]:
        book_title = content.string
        book_url = content.get("href")
        try:
            # Data cleaning: find the index of the "章" (chapter) character;
            # if it is absent, str.index raises ValueError and the entry
            # is skipped below
            book_title_index = str(book_title).index("章", 0)
            # Slice off everything up to and including "章" to get the
            # bare chapter title
            new_book_title = book_title[book_title_index + 1:]
            # Renumber the chapters sequentially ("第{}章" = "Chapter {}")
            # and strip leading whitespace from the title
            i = i + 1
            new_book_titles = "第{}章".format(i) + new_book_title.lstrip()
            new_book_url = "http://www.biqukan.com{}".format(book_url)
            # Store each chapter as a one-entry dict and collect it in the list
            contents = {new_book_titles: new_book_url}
            book_contents_list.append(contents)
        except ValueError:
            # No "章" character found: not a body chapter, so don't record it
            # (print book_url here, since new_book_url is never assigned
            # when the exception is raised)
            print("***************** Not a chapter node, skipped ****************")
            print("original title =", book_title)
            print("original link =", book_url)
    return book_contents_list

"""
Fetch a chapter's text from its link
"""
def getConnect(url):
    req = requests.get(url=url)
    req.encoding = "gbk"
    html = req.text
    div_bf = BeautifulSoup(html, "html5lib")
    div = div_bf.find("div", id="content")
    # Strip embedded <script> tags before extracting the text
    for s in div("script"):
        s.extract()
    return div.text

"""
Write one chapter's text to a file
"""
def saveData(filepath, text):
    with open(filepath, mode="w", encoding="UTF-8") as f:
        f.write(text)
        f.write("\n\n")

if __name__ == '__main__':
    book_list = getBookContents("http://www.biqukan.com/1_1094")
    for li in book_list:
        # Each entry is a one-key dict: {chapter title: chapter url}.
        # The d:\123 directory must exist before running.
        for title, url in li.items():
            filepath = "d:\\123\\" + title
            text = getConnect(url)
            saveData(filepath, text)
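A note on the data structure: getBookContents stores each chapter as a one-entry dict, which forces the main loop to iterate over keys just to recover the single value. Collecting plain (title, url) tuples would make the loop read more naturally. A minimal sketch of that alternative; the .txt extension and the half-second delay are assumptions added for tidier filenames and a gentler request rate, not part of the original code:

import time

# In getBookContents, append a tuple instead of a one-entry dict:
#     book_contents_list.append((new_book_titles, new_book_url))

def crawl(book_list):
    for title, url in book_list:
        text = getConnect(url)
        saveData("d:\\123\\" + title + ".txt", text)  # assumed .txt extension
        time.sleep(0.5)  # assumed delay to avoid hammering the server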