Python 抓取小说——练习

action.py

import requests
from lxml import etree
class Action:
    """Scrapers for a book's chapter-list page and its chapter pages."""

    @staticmethod
    def getList(book_link, timeout=10):
        """Fetch the chapter-list page and extract the title and chapter links.

        Args:
            book_link: URL of the page that lists the book's chapters.
            timeout: Seconds to wait for the server before giving up.

        Returns:
            A dict with "book_name" (the first matching title string) and
            "list_links" (the chapter href values exactly as found in the
            page).

        Raises:
            requests.RequestException: On network failure, timeout or an
                HTTP error status.
            IndexError: If the page contains no book title element.
        """
        r = requests.get(book_link, timeout=timeout)
        # Fail early instead of parsing an error page as if it were a book.
        r.raise_for_status()
        # The response body is decoded as GBK regardless of its headers.
        r.encoding = "gbk"
        html = etree.HTML(r.text)
        list_links = html.xpath('//div[@class="ml_list"]/ul/li/a/@href')
        names = html.xpath('//div[@class="introduce"]/h1/text()')
        if not names:
            raise IndexError("book title not found in page: " + book_link)
        return {"book_name": names[0], "list_links": list_links}

    @staticmethod
    def getContent(page_link, timeout=10):
        """Fetch one chapter page and extract its title and body text.

        Args:
            page_link: Absolute URL of the chapter page.
            timeout: Seconds to wait for the server before giving up.

        Returns:
            A dict with "title" (the first matching heading string) and
            "content" (every paragraph followed by two CRLF line breaks).

        Raises:
            requests.RequestException: On network failure, timeout or an
                HTTP error status.
            IndexError: If the page contains no chapter title element.
        """
        r = requests.get(page_link, timeout=timeout)
        r.raise_for_status()
        r.encoding = "gbk"
        html = etree.HTML(r.text)
        title = html.xpath('//div[@class="nr_title"]/h3/text()')
        if not title:
            raise IndexError("chapter title not found in page: " + page_link)
        contents = html.xpath('//div[@class="articlecontent"]/p/text()')
        # Each paragraph is followed by a blank line (two CRLF pairs).
        content = "".join(p + "\r\n\r\n" for p in contents)
        return {"title": title[0], "content": content}

 

 save.py

class Save:
    """File output helpers."""

    @staticmethod
    def saveTxt(file_name, txt_content):
        """Write text to a file as GBK-encoded bytes.

        Characters that GBK cannot represent (for example the non-breaking
        space U+00A0 or the replacement character U+FFFD) are written as
        "?" instead of raising UnicodeEncodeError, so one stray character
        does not abort the whole download.

        Args:
            file_name: Path of the file to create or overwrite.
            txt_content: Text to write.

        Raises:
            OSError: If the file cannot be opened or written.
        """
        # Binary mode keeps the "\r\n" sequences in the text untouched.
        with open(file_name, "wb") as f:
            f.write(txt_content.encode("gbk", errors="replace"))

 

tools.py

import os
import glob
class Tools:
    """Filesystem helpers."""

    @staticmethod
    def mkdir(path, root_flag=False):
        """Create a directory, picking a numbered name when it already exists.

        If ``path`` does not exist it is created (with parents) and returned.
        If it exists and ``root_flag`` is true, nothing is created and
        ``path`` is returned. Otherwise a sibling named ``<path>_<n>`` is
        created, where ``n`` is one more than the highest numeric suffix
        already present next to ``path`` (starting at 1).

        Args:
            path: Directory to create.
            root_flag: When true, an existing ``path`` is reused as-is.

        Returns:
            The path of the directory that should be used.

        Raises:
            OSError: If a directory cannot be created or listed.
        """
        if not os.path.exists(path):
            os.makedirs(path)
            return path
        if root_flag:
            return path

        # Drop trailing separators so "a/b/" is numbered as "a/b_1".
        stem = path.rstrip(os.sep + (os.altsep or "")) or path
        parent, name = os.path.split(stem)
        prefix = name + "_"

        # Scan the siblings for "<name>_<digits>" only. Unrelated entries
        # that merely share the prefix (e.g. "<name>_extra") are ignored,
        # and the numeric comparison keeps "_10" after "_9".
        highest = 0
        for entry in os.listdir(parent or os.curdir):
            tail = entry[len(prefix):]
            if entry.startswith(prefix) and tail.isdecimal():
                highest = max(highest, int(tail))

        while True:
            highest += 1
            candidate = "{}_{}".format(stem, highest)
            try:
                os.makedirs(candidate)
            except FileExistsError:
                # Someone else took this number in the meantime; try the next.
                continue
            return candidate

 

main.py

from module.action import Action as action
from module.save import Save as save
from module.tools import Tools as tools

def _safe_name(name):
    """Return ``name`` with characters that are invalid in file names replaced."""
    import re

    cleaned = re.sub(r'[\\/:*?"<>|\r\n\t]', "_", name).strip()
    # Never hand back an empty name; it would produce a file called ".txt".
    return cleaned or "untitled"


def main():
    """Prompt for a book cover URL and download every chapter as a text file.

    Chapters are written to ``book/<book name>/<chapter title>.txt``. When
    that folder already exists, the folder returned by ``tools.mkdir`` is
    used instead. An empty input exits without doing anything.
    """
    from urllib.parse import urljoin

    book_link = input("请输入书籍封面页链接:").strip()
    if not book_link:
        return
    # Fetch the chapter list.
    l_data = action.getList(book_link)
    # Use the directory that mkdir actually created; it may carry a numeric
    # suffix when the plain name was already taken.
    book_dir = tools.mkdir("book/" + _safe_name(l_data["book_name"]))
    # Download the chapters one by one.
    for link in l_data["list_links"]:
        # Resolve the href against the page it came from rather than a
        # hard-coded host.
        p_data = action.getContent(urljoin(book_link, link))
        save_path = book_dir + "/" + _safe_name(p_data["title"]) + ".txt"
        save.saveTxt(save_path, p_data["content"])
        print(p_data["title"]+"------抓取完成")


# Run the downloader only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()

 

运行:

 

posted @ 2022-04-05 13:20  波罗斯の程序日记  阅读(83)  评论(0)  编辑  收藏  举报