# Scrape the completed ("完本") books from Biquge (笔趣阁).
# Scrape entry URL: http://www.biquge.tv/wanben/1_1
import os, time, shutil, requests, sqlite3
from bs4 import BeautifulSoup
from threading import Thread
from datetime import datetime
def fun_makedir(file_path):
    """
    Create a directory (including missing parents) and make it the working directory.

    :param file_path: Path of the directory to create and enter.
    :return: None
    :raises OSError: If the directory cannot be created or entered.
    """
    # exist_ok=True replaces the separate "exists?" check, so a directory that
    # appears between the check and the creation no longer causes a failure.
    os.makedirs(file_path, exist_ok=True)
    os.chdir(file_path)
def main():
    """
    Download every book found on the entry listing page, then export them.

    Recreates the database, reads the ``[name, url]`` pairs from the listing
    page, starts one ``save_book_todb`` thread per book with a pause between
    starts, waits for all threads, prints a summary and calls ``show_books``.

    :return: None
    """
    create_db()  # start from a fresh database file
    started_at = datetime.now()
    entry_url = "http://www.biquge.tv/wanben/1_1"  # listing page used as the crawl entry point
    found_books = get_book(entry_url)  # list of [book_name, book_url]
    total_book = len(found_books)

    # Book ids are 1-based positions in the listing.
    workers = [
        Thread(target=save_book_todb, args=(book_id, book[0], book[1]))
        for book_id, book in enumerate(found_books, start=1)
    ]

    for thread_no, worker in enumerate(workers, start=1):
        # Text progress bar: one ">" per started book, one "." per pending book.
        print("下载进度:[{}{}]".format(">" * (thread_no), "." * (total_book - thread_no)))
        worker.start()
        time.sleep(5)
        # ``delay`` is the module-level value that save_book_todb overwrites
        # with the result of set_delay() for the book it is downloading.
        time.sleep(delay)

    for worker in workers:
        worker.join()
        print(worker)

    print("\n共抓取{}部小说\t".format(count))
    elapsed_seconds = (datetime.now() - started_at).total_seconds()
    print("总共用时{}秒".format(elapsed_seconds), end="\t")
    print("{}正在导出小说".format(">" * 100))
    show_books()
def create_db():
    """
    Reset the database: delete the file named by ``dbname`` if present, then
    open and immediately close a new connection to it.

    :return: None
    """
    stale_db_present = os.path.exists(dbname)
    if stale_db_present:
        os.remove(dbname)
    # Open and close right away; nothing is written here.
    sqlite3.connect(dbname).close()
def set_delay(total_chapter):
    """
    Pick a pause length (in seconds) that grows with the size of a book.

    :param total_chapter: Number of chapters in the book.
    :return: ``30`` for up to 1000 chapters, plus another ``30`` for every
        further 1000-chapter threshold exceeded, capped at ``180``.
    """
    base = 30
    # (exclusive lower bound, multiplier), checked from the largest tier down.
    tiers = (
        (5000, 6),
        (4000, 5),
        (3000, 4),
        (2000, 3),
        (1000, 2),
    )
    for lower_bound, multiplier in tiers:
        if total_chapter > lower_bound:
            return base * multiplier
    return base
def get_book(url, timeout=30):
    """
    Fetch a listing page and collect the name and URL of every book on it.

    :param url: URL of the listing page to scrape.
    :param timeout: Seconds to wait for the server before giving up, so a
        stalled connection cannot block the caller forever.
    :return: List of ``[book_name, book_url]`` pairs. Empty when the page has
        no ``<div class="r">`` container; list items without a link are skipped.
    :raises requests.RequestException: If the request fails or times out.
    """
    response = requests.get(url, headers=headers, timeout=timeout)
    response.encoding = "gbk"  # decode the body as GBK
    soup = BeautifulSoup(response.text, "html.parser")

    container = soup.find('div', class_='r')
    if container is None:
        # Unexpected page layout (error page, redesign, ...): report no books
        # instead of failing with an AttributeError on None.
        return []

    books = []
    for entry in container.find_all('li'):
        link = entry.find('a')
        if link is None or not link.get('href'):
            continue
        books.append([link.get_text(), link['href']])
    return books
def get_chapter(book_url, timeout=30):
    """
    Fetch a book's index page and collect the name and URL of its chapters.

    :param book_url: URL of the book's chapter index page.
    :param timeout: Seconds to wait for the server before giving up, so a
        stalled connection cannot block the calling thread forever.
    :return: List of ``[chapter_name, chapter_url]`` pairs. Empty when the page
        has no ``<div id="list">`` container; entries without a link are skipped.
    :raises requests.RequestException: If the request fails or times out.
    """
    chapter_res = requests.get(book_url, headers=headers, timeout=timeout)
    chapter_res.encoding = 'gbk'  # decode the body as GBK
    chapter_soup = BeautifulSoup(chapter_res.text, "html.parser")

    listing = chapter_soup.find('div', id="list")
    if listing is None:
        # Unexpected page layout: report no chapters instead of failing with
        # an AttributeError on None.
        return []

    chapters = []
    # The first 9 <dd> entries are skipped.
    # NOTE(review): presumably a "latest chapters" preview that repeats
    # entries listed further down -- confirm against the page layout.
    for entry in listing.find_all('dd')[9:]:
        link = entry.find('a')
        if link is None or not link.get('href'):
            continue
        # The site prefix is prepended to every chapter href.
        chapters.append([link.get_text(), "http://www.biquge.tv" + link['href']])
    return chapters
def save_book_todb(book_id, book_name, book_url):
    """
    Download all chapters of one book, one thread per chapter, and store them.

    Also bumps the module-level ``count`` of processed books and overwrites
    the module-level ``delay`` with the value ``set_delay`` returns for this book.

    :param book_id: Numeric id of the book, forwarded to every chapter worker.
    :param book_name: Title of the book.
    :param book_url: URL of the book's chapter index page.
    :return: None
    """
    global count, delay
    count += 1

    chapter_list = get_chapter(book_url)  # [chapter_name, chapter_url] pairs
    total_chapter = len(chapter_list)
    delay = set_delay(total_chapter)  # pause length derived from the book size
    print("正在下载----小说 {}{:<2s}{},共有{}章节,请等待{}秒".format('>' * 50, str(book_id), book_name, total_chapter, delay))

    # Chapter ids are 1-based positions in the chapter list.
    workers = [
        Thread(
            target=save_chapter_todb,
            args=(chapter_id, chapter[0], chapter[1], book_id, book_name, book_url),
        )
        for chapter_id, chapter in enumerate(chapter_list, start=1)
    ]

    for worker in workers:
        worker.start()
        time.sleep(0.01)  # short gap between thread starts
    for worker in workers:
        worker.join()

    print("下载完成 {}{:<2s}{},共有{}章节".format('*' * 30, str(book_id), book_name, len(chapter_list)))
def _report_chapter_error(chapter_id, chapter_name, chapter_url, book_name, reason):
    """Print the one-line failure notice for a chapter that could not be saved."""
    print(
        "保存章节出错 {}书名:{},章节{:<4s}{},章节链接:{},原因:{}".format(
            '.' * 10, book_name, str(chapter_id), chapter_name, chapter_url, reason))


def save_chapter_todb(chapter_id, chapter_name, chapter_url, book_id, book_name, book_url, timeout=30):
    """
    Download one chapter page, extract its text and store it in the database.

    :param chapter_id: Position of the chapter within the book.
    :param chapter_name: Title of the chapter.
    :param chapter_url: URL of the chapter page.
    :param book_id: Numeric id of the book the chapter belongs to.
    :param book_name: Title of the book.
    :param book_url: URL of the book's index page.
    :param timeout: Seconds to wait for the server before giving up, so a
        stalled connection cannot block this worker thread forever.
    :return: None
    :raises requests.RequestException: If the request fails or times out.
    """
    down_chapter_res = requests.get(chapter_url, headers=headers, timeout=timeout)
    down_chapter_res.encoding = 'gbk'  # decode the body as GBK
    down_chapter_soup = BeautifulSoup(down_chapter_res.text, "html.parser")

    content = down_chapter_soup.find('div', id="content")
    if content is None:
        # Page without the expected content container: report it instead of
        # failing with an AttributeError on None.
        _report_chapter_error(chapter_id, chapter_name, chapter_url, book_name, "未找到正文内容")
        return

    # Drop whitespace-only lines but keep the line endings of the others.
    # (The previous code called s.strip() and discarded the result, so blank
    # lines were never removed.)  Quotes are no longer doubled here because
    # save_db binds the text as a query parameter.
    chapter_text = "".join(
        line for line in content.text.splitlines(True) if line.strip())

    save_db(chapter_id, chapter_name, chapter_text, chapter_url, book_id, book_name, book_url)


def save_db(chapter_id, chapter_name, chapter_text, chapter_url, book_id, book_name, book_url):
    """
    Insert one chapter row into the table of its book.

    All values are bound as query parameters, so they must be passed raw
    (no manual quote escaping).  A database failure is reported on stdout and
    not re-raised, so one bad chapter does not abort the download.

    :param chapter_id: Position of the chapter within the book.
    :param chapter_name: Title of the chapter.
    :param chapter_text: Text of the chapter.
    :param chapter_url: URL of the chapter page.
    :param book_id: Numeric id of the book; selects the ``book_<id>`` table.
    :param book_name: Title of the book.
    :param book_url: URL of the book's index page.
    :return: None
    """
    try:
        table_name = create_table_book(book_id)
        conn = sqlite3.connect(dbname)
        try:
            cursor = conn.cursor()
            # A table name cannot be bound as a parameter; it comes from
            # create_table_book, not from scraped text.  The table name is
            # also what gets stored in the row's book_id column.
            cursor.execute(
                "insert into " + table_name + " values(?,?,?,?,?,?,?)",
                (chapter_id, chapter_name, chapter_text, chapter_url, table_name, book_name, book_url))
            conn.commit()
            cursor.close()
        finally:
            conn.close()  # release the connection even when the insert fails
    except sqlite3.Error as err:
        _report_chapter_error(chapter_id, chapter_name, chapter_url, book_name, err)
def create_table_book(table_id):
    """
    Make sure the table for one book exists and report its name.

    :param table_id: Value appended to the ``book_`` prefix to form the table name.
    :return: Name of the table, i.e. ``"book_<table_id>"``.
    """
    table_name = "book_{}".format(table_id)
    column_defs = (
        "chapter_id int",
        "chapter_name varchar(20)",
        "chapter_text varchar(10000)",
        "chapter_url varchar(60)",
        "book_id varchar(20)",
        "book_name varchar(100)",
        "book_url varchar(100)",
    )
    # IF NOT EXISTS makes the statement a no-op when the table is already there.
    ddl = "create table IF NOT EXISTS " + table_name + "(" + ",".join(column_defs) + ")"

    connection = sqlite3.connect(dbname)
    db_cursor = connection.cursor()
    db_cursor.execute(ddl)
    connection.commit()
    db_cursor.close()
    connection.close()
    return table_name
def show_books():
    """
    Export every book stored in the database, one ``show_book`` thread per book.

    The ids are taken from the ``book_<id>`` tables that actually exist rather
    than assumed to be ``1..N``, so a book whose table was never created does
    not break the export of the others.

    :return: None
    """
    start = datetime.now()

    conn = sqlite3.connect(dbname)
    try:
        cursor = conn.cursor()
        cursor.execute("select name from sqlite_master where type = 'table'")
        table_names = [row[0] for row in cursor.fetchall()]
        cursor.close()
    finally:
        conn.close()

    prefix = "book_"
    # Keep only tables named book_<digits>; export in ascending id order.
    table_ids = sorted(
        int(name[len(prefix):])
        for name in table_names
        if name.startswith(prefix) and name[len(prefix):].isdigit()
    )

    threads = [Thread(target=show_book, args=(table_id,)) for table_id in table_ids]
    count = len(threads)
    for t in threads:
        t.start()
        time.sleep(0.1)  # short gap between thread starts
    for t in threads:
        t.join()

    run_time = (datetime.now() - start).total_seconds()
    print("\n\n导出小说完成,共导出{}部小说。".format(count), end="\t")
    print("花费{}秒。".format(run_time), end="\n\n")
def show_book(table_id):
    """
    Export one book from its ``book_<table_id>`` table to a UTF-8 text file.

    The file is written to the current working directory and named
    ``"<book_id> <book_name>[共<N>章].txt"``; an existing file of that name is
    overwritten.  A table without rows is reported and skipped.

    :param table_id: Id selecting the ``book_<table_id>`` table.
    :return: None
    """
    sql = "select book_id,book_name,chapter_name,chapter_text,chapter_id,book_url from book_" + str(
        table_id) + " order by chapter_id"
    conn = sqlite3.connect(dbname)
    try:
        cursor = conn.cursor()
        cursor.execute(sql)
        results = cursor.fetchall()
        cursor.close()
    finally:
        conn.close()  # all rows are in memory; release the connection early

    if not results:
        # Nothing to export; results[0] below would raise IndexError.
        print("小说表 book_{} 为空,跳过导出".format(table_id))
        return

    first_row = results[0]
    file_name = first_row[0] + " " + first_row[1] + "[共" + str(len(results)) + "章]" + ".txt"
    print("正在导出小说>>>{}".format(file_name))

    # Open once in write mode: this truncates an old export and avoids
    # reopening the file for every chapter.
    with open(file_name, 'w', encoding='utf-8') as f:
        # Header: banner, book title, chapter count and the book's URL.
        f.write("{}完本小说{}\n\n{}**【{}】**\n{}共有{}章节\n{}在线阅读网址:{}\n\n{}"
                .format("*" * 40, "*" * 40, " " * 20, first_row[1], " " * 20, len(results), " " * 20, first_row[5], "*" * 90))
        for r in results:
            f.write("{}第{}章:{}{}".format("\n\n", r[4], r[2], "\n\n"))  # chapter heading
            f.write(r[3])  # chapter text
if __name__ == '__main__':  # script entry point
    # Module-level settings read by the functions above.
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36"}
    category = "笔趣阁"
    # The database file name carries today's local date.
    dbname = "笔趣阁小说_" + time.strftime("%Y-%m-%d") + ".sqlite"
    save_path = os.getcwd() + '/down/' + category
    count = 0  # number of books processed so far
    delay = 20  # pause between book downloads, in seconds

    fun_makedir(save_path)  # create the output folder and switch into it
    # main()  # full crawl entry point, currently disabled
    show_books()  # export whatever the database already holds