Python web crawler（2.1）多循环嵌套练习

写个函数，传入（书名:book，标题:title，内容:content），要求在book文件夹下（不存在则创建），创建每个title.txt文件，写入content内容

import os

def save_to_file(folder_book, title, content):
    """Write ``content`` to ``<folder_book>/<title>.txt``.

    Args:
        folder_book: Path of the book folder; it is created (including any
            missing parent folders) when it does not exist yet.
        title: Chapter title, used as the file name without the extension.
        content: Text written to the file, encoded as UTF-8.

    An existing file with the same name is overwritten.
    """
    # exist_ok=True makes this a no-op when the folder is already there,
    # without a separate existence check beforehand.
    os.makedirs(folder_book, exist_ok=True)

    # The file lives inside the book folder passed in by the caller.
    file_path = os.path.join(folder_book, f"{title}.txt")

    with open(file_path, 'w', encoding='utf-8') as file:
        file.write(content)

# Example usage: write one chapter file into the "MyBook" folder
book_name = "MyBook"
chapter_title = "Chapter1"
chapter_content = "This is the content of Chapter 1."

save_to_file(book_name, chapter_title, chapter_content)

从某一个书的网站，写几个函数，分别获取目录中的（所有书、所有章节、所有内容），返回为字典，写入本地output.json中

import random
import time
import json  # 导入 json 模块

def get_html(url):
    """Fetch ``url`` and return a parsed HTML tree object.

    This is a placeholder: the network request and the HTML parsing still
    have to be written here.

    Args:
        url: Address of the page to download.

    Raises:
        NotImplementedError: Always, until the request/parsing code is added.
    """
    # Fail with an explicit message instead of returning an undefined name.
    raise NotImplementedError(
        "get_html is a placeholder: request the page and parse it into a tree"
    )

def get_books(tree):
    """Return a dict mapping each book name to that book's URL.

    This is a placeholder: the code extracting book names and URLs from
    ``tree`` still has to be written here.

    Args:
        tree: Parsed HTML tree of the page that lists the books.

    Raises:
        NotImplementedError: Always, until the extraction code is added.
    """
    # Fail with an explicit message instead of returning undefined names.
    raise NotImplementedError(
        "get_books is a placeholder: extract {book_name: book_url} from tree"
    )

def book_mulu(book_url):
    """Return a dict mapping each chapter title to that chapter's URL.

    This is a placeholder: the code collecting the chapter list for one book
    still has to be written here.

    Args:
        book_url: Source of the book's table of contents.
            NOTE(review): main() passes the result of get_html(book_url),
            i.e. a parsed tree rather than a URL string; confirm which one
            the implementation should expect.

    Raises:
        NotImplementedError: Always, until the extraction code is added.
    """
    # Fail with an explicit message instead of returning undefined names.
    raise NotImplementedError(
        "book_mulu is a placeholder: extract {title: title_url} for the book"
    )

def book_mulu_detail(title, tree):
    """Return a one-entry dict mapping ``title`` to the chapter's text.

    This is a placeholder: the code extracting the chapter text from
    ``tree`` still has to be written here.

    Args:
        title: Chapter title, used as the key of the returned dict.
        tree: Parsed HTML tree of the chapter page.

    Raises:
        NotImplementedError: Always, until the extraction code is added.
    """
    # Fail with an explicit message instead of returning an undefined name.
    raise NotImplementedError(
        "book_mulu_detail is a placeholder: extract {title: text} from tree"
    )

def main():
    """Crawl every book, chapter by chapter, and dump it all to output.json.

    The saved structure is ``{book name: {chapter title: chapter text}}``.
    """
    start_url = 'https://www.shu.com/book1'
    library = {}

    catalog = get_books(get_html(start_url))
    for book_name, book_url in catalog.items():
        print(book_name)

        # Collect this book's chapters before attaching them to the result.
        chapters = {}
        chapter_links = book_mulu(get_html(book_url))
        for title, title_url in chapter_links.items():
            chapter_tree = get_html(title_url)
            chapters.update(book_mulu_detail(title, chapter_tree))
            print(title, title_url, "下载完成")
            # Pause 1-3 seconds between chapters to avoid hammering the site.
            time.sleep(random.randint(1, 3))

        library[book_name] = chapters

    with open('output.json', 'w', encoding='utf-8') as out:
        json.dump(library, out, ensure_ascii=False, indent=2)

# Run main() only when this file is executed as a script, not when imported
if __name__ == '__main__':
    main()

把以下“目录字典”从本地 output.json 中读取出来：

# Sample catalog dict: book name -> {chapter name -> chapter content}
mulu_dict = {
    '书1': {'章节1': '内容1', '章节2': '内容2', '章节3': '内容3'},
    '书2': {'章节1': '内容1', '章节2': '内容2', '章节3': '内容3'},
    '书3': {'章节1': '内容1', '章节2': '内容2', '章节3': '内容3'},
    '书4': {'章节1': '内容1', '章节2': '内容2', '章节3': '内容3'}
}

要读取 JSON 文件并将其转换为字典，可以使用json.load()函数。

import json

# Load the JSON file back into a dictionary
with open('output.json', 'r', encoding='utf-8') as file:
    mulu_dict = json.load(file)

# Print the dictionary that was just loaded
print(mulu_dict)

把下面目录字典，依次写入到 \书X\章节X.txt 中

# Sample catalog dict: book name -> {chapter name -> chapter content}
mulu_dict = {
    '书1': {'章节1': '内容1', '章节2': '内容2', '章节3': '内容3'},
    '书2': {'章节1': '内容1', '章节2': '内容2', '章节3': '内容3'},
    '书3': {'章节1': '内容1', '章节2': '内容2', '章节3': '内容3'},
    '书4': {'章节1': '内容1', '章节2': '内容2', '章节3': '内容3'}
}

代码如下

import os

# Catalog dict to write out: book name -> {chapter name -> chapter content}
mulu_dict = {
    '书1': {'章节1': '内容1', '章节2': '内容2', '章节3': '内容3'},
    '书2': {'章节1': '内容1', '章节2': '内容2', '章节3': '内容3'},
    '书3': {'章节1': '内容1', '章节2': '内容2', '章节3': '内容3'},
    '书4': {'章节1': '内容1', '章节2': '内容2', '章节3': '内容3'}
}

def write_to_files(dictionary):
    """Write every chapter of every book to ``<book name>/<chapter name>.txt``.

    Args:
        dictionary: Mapping of book name to a mapping of chapter name to
            chapter text.

    Each book gets a folder named after it, relative to the current working
    directory; the folder is created when missing, even for a book with no
    chapters. Each chapter file holds the chapter name on its first line,
    followed by the content. Existing files are overwritten.
    """
    for book_name, chapters in dictionary.items():
        # The book's folder is named after the book itself.
        book_dir = book_name
        # exist_ok=True makes this a no-op when the folder already exists.
        os.makedirs(book_dir, exist_ok=True)
        print('准备写入：',book_name)

        for chapter_name, content in chapters.items():
            # Relative path: <book name>/<chapter name>.txt
            file_path = os.path.join(book_dir, f"{chapter_name}.txt")
            # Title on the first line, chapter text after it.
            with open(file_path, 'w', encoding='utf-8') as file:
                file.write(chapter_name+'\n')
                file.write(content)
                print(chapter_name,'写入完成')

# Write every chapter of every book in mulu_dict to disk
write_to_files(mulu_dict)

posted @ 2024-01-25 18:45 Magiclala 阅读(46) 评论(0) 收藏举报

刷新页面返回顶部

Magiclala的博客

Python web crawler（2.1）多循环嵌套练习

公告