rust 爬取笔趣阁生成epub文件

简单研究一下 epub,毕竟 txt 看着总是不爽,后面再优化 epub 样式。

Cargo.toml
[package]
name = "bqg_epub"
version = "0.1.0"
edition = "2021"

[dependencies]
epub-builder = "0.7.4"
reqwest = {version = "0.12.5",features = ["blocking","json"]}
tokio = {version = "1.38.0",features = ["full"]}
scraper ="0.19.0"
rand = { version = "0.8.5", features = ["default"] }
url = "2.5.2"
clap = {version = "4.5.7",features = ["derive"]}
main.rs
use std::cmp::Ordering;

use std::fs::{File, OpenOptions};
use epub_builder::EpubBuilder;
use epub_builder::Result;
use epub_builder::ZipLibrary;
use epub_builder::EpubContent;
use epub_builder::ReferenceType;

use std::io::Write;
use std::path::Path;
use std::{fs, io};
use std::time::Duration;
use clap::Parser;
use reqwest::{Client, Url};
use scraper::{Html, Selector};
use rand::{Rng};


/// A novel scraped from a book homepage: its metadata plus its chapters.
#[derive(Debug)]
struct Book {
    /// Book title, read from `#info > h1` on the homepage; also used as the EPUB file name.
    title: String,
    /// URL of the book homepage; `main` also passes it as the base URL for chapter links.
    homepage: String,
    /// Introduction text, read from `#intro` on the homepage.
    intro: String,
    /// Author name, read from `#info > p:nth-child(2) > a` on the homepage.
    author: String,

    /// Chapters of the book; `main` downloads each chapter's content before the EPUB is built.
    chapters: Vec<Chapter>,
}

impl Book {
    fn new(homepage: &str) -> Self {
        Self {
            title: String::default(),
            author: String::default(),
            intro: String::default(),
            chapters: Vec::new(),
            homepage: homepage.to_string(),
        }
    }
    fn get_book_info(&mut self, text: &str) -> Result<()> {
        let mut chapters = vec![];
        let document = Html::parse_document(&text);
        let chapter_selector = Selector::parse("#list > dl > dd > a").unwrap();
        let author_selector = Selector::parse("#info > p:nth-child(2) > a").unwrap();
        let intro_selector = Selector::parse("#intro").unwrap();
        let title_selector = Selector::parse("#info > h1").unwrap();

        self.author = document.select(&author_selector).next().unwrap().text().collect::<Vec<_>>().join(" ");
        self.intro = document.select(&intro_selector).next().unwrap().text().collect::<Vec<_>>().join(" ");
        self.title = document.select(&title_selector).next().unwrap().text().collect::<Vec<_>>().join(" ");

        for element in document.select(&chapter_selector) {
            if let Some(href) = element.value().attr("href") {
                let text = element.text().collect::<Vec<_>>().join(" ");

                let c = Chapter::new(href, &text);
                // chapters.push(c);
                self.add_chapter(c);
            }
        }
        chapters.sort();
        self.chapters = chapters;
        Ok(())
    }

    fn add_chapter(&mut self, chapter: Chapter) {
        if !self.chapters.iter().any(|c| c.href == chapter.href) {
            self.chapters.push(chapter)
        }
    }


    fn generate_epub(&self) -> Result<()> {
        // let mut output = Vec::<u8>::new();

        // todo: replace with real cover image
        let dummy_image = "Not really a PNG image";
        let css = "
body {
    font-family: serif;
    font-size: 1em;
    color: #000000;
    line-height: 1.5;
    margin: 0;
    padding: 0;
}

p {
    text-align: justify;
    text-indent: 1.5em;
    margin: 0 0 1em 0;
}

h1, h2, h3, h4, h5, h6 {
    font-family: sans-serif;
    margin-top: 1.5em;
    margin-bottom: 0.5em;
    color: #333333;
}

img {
    max-width: 100%;
    height: auto;
    display: block;
    margin: 1em auto;
}

a {
    color: #1e90ff;
    text-decoration: none;
}

a:hover {
    text-decoration: underline;
}

ul, ol {
    margin: 0 0 1em 1.5em;
    padding: 0;
}

li {
    margin-bottom: 0.5em;
}

blockquote {
    margin: 0 1.5em;
    padding: 0.5em;
    border-left: 5px solid #ccc;
    background: #f9f9f9;
}
";
        let mut output = OpenOptions::new()
            .read(true)
            .write(true)
            .create(true)
            .open(format!("{}.epub", &self.title)).unwrap();

        // Create a new EpubBuilder using the zip library
        let mut builder = EpubBuilder::new( ZipLibrary::new()?)?;
        builder
            // Set some metadata
            .metadata("author", &self.author)?
            .metadata("title", &self.title)?
            .metadata("description", &self.intro)?
            .add_cover_image("cover.png", dummy_image.as_bytes(), "image/png")?
            // Add a resource that is not part of the linear document structure
            .add_resource("some_image.png", dummy_image.as_bytes(), "image/png")?
            .stylesheet(css.as_bytes())?
        ;

        for chapter in self.chapters.iter() {
            builder.add_content(EpubContent::new(&chapter.href, &*chapter.content.as_bytes())
                .title(&chapter.title)
                .reftype(ReferenceType::TitlePage))?;
        }

        builder.inline_toc()
            // Finally, write the EPUB file to a writer. It could be a `Vec<u8>`, a file,
            // `stdout` or whatever you like, it just needs to implement the `std::io::Write` trait.
            .generate(&mut output)?;


        Ok(())
    }
}

// const BASE_URL: &str = "https://www.xbiqugew.com/book/53099/";
// Command-line arguments, parsed with clap's derive API.
// NOTE: the `///` comment on the field below is deliberately left unchanged,
// because clap's derive turns doc comments into help text shown to the user.
#[derive(Parser, Debug)]
#[command(author, version, about, long_about = None)]
#[command(next_line_help = true)]
struct Args {
    // URL of the book homepage; `main` fetches it and hands it to `Book::new`.
    /// base_url
    #[arg(short, long)]
    url: String,
}


/// Entry point: downloads the book homepage given by `--url`, fetches every
/// chapter with a random pause between requests, and writes the EPUB file.
///
/// Every failure (network, parsing, file I/O) is propagated through the
/// returned `Result` instead of panicking, so the process exits with an error
/// message rather than a panic backtrace.
#[tokio::main]
async fn main() -> Result<()> {
    let args = Args::parse();
    let client = Client::builder().build()?;
    let html = query_book_homepage(&client, &args.url).await?;

    let mut book = Book::new(&args.url);
    book.get_book_info(&html)?;

    for chapter in book.chapters.iter_mut() {
        println!("href: {}  | title: {}", chapter.href, chapter.title);
        // Pause between requests so the site is not hit in a tight loop.
        let delay = random_delay();
        println!("Waiting for {} milliseconds before the next request...", delay.as_millis());
        tokio::time::sleep(delay).await;
        chapter.scraper_chapter_content(&book.homepage, &client).await?;
    }
    book.generate_epub()?;

    println!("{:?}", book);
    Ok(())
}

/// Downloads the book homepage and returns its body as text.
///
/// # Errors
///
/// Returns an error when the request fails, when the server answers with a
/// 4xx/5xx status (so an error page is never mistaken for the book page), or
/// when the body cannot be read.
async fn query_book_homepage(client: &Client, homepage: &str) -> Result<String> {
    let response = client.get(homepage).send().await?.error_for_status()?;
    let html = response.text().await?;
    println!("scraper homepage: {} done!", homepage);
    Ok(html)
}

/// One chapter of a book: its link on the site plus the downloaded text.
///
/// Equality is based on `href`, ordering on `number` (see the trait impls in this file).
#[derive(Eq, Debug)]
struct Chapter {
    /// Numeric id parsed from `href`; used to order chapters.
    number: usize,
    /// Link to the chapter page, as found in the homepage's chapter list.
    href: String,
    /// Chapter title (the text of the chapter link).
    title: String,
    /// Chapter text; empty until `scraper_chapter_content` has run.
    content: String,
}

impl Chapter {
    /// Creates a chapter with empty content.
    ///
    /// `number` is derived from the file name in `href`: `"123.html"` and
    /// `"/book/1/123.html"` both give `123`. Links that do not follow this
    /// scheme get number `0` instead of aborting the program.
    fn new(href: &str, title: &str) -> Self {
        Self {
            number: Self::number_from_href(href),
            href: href.to_string(),
            title: title.to_string(),
            content: String::default(),
        }
    }

    /// Parses the part of the last path segment that precedes the first `.`,
    /// returning `0` when it is not a non-negative integer.
    fn number_from_href(href: &str) -> usize {
        let file_name = href.rsplit('/').next().unwrap_or(href);
        let stem = file_name.split('.').next().unwrap_or(file_name);
        stem.parse().unwrap_or(0)
    }

    /// Downloads the chapter page and stores its text in `self.content`.
    ///
    /// `self.href` is resolved against `base_url`, the text of the `#content`
    /// element is joined with `\r\n`, and the result is passed through
    /// `replace_html_entities`. A page without a `#content` element yields a
    /// placeholder text rather than an error.
    ///
    /// NOTE(review): resolution follows the usual relative-URL rules, so a
    /// `base_url` without a trailing slash presumably has its last path segment
    /// replaced by `href` — confirm against the URLs actually passed in.
    ///
    /// # Errors
    ///
    /// Returns an error when `base_url` or the joined URL is invalid, or when
    /// the request or reading the body fails.
    async fn scraper_chapter_content(&mut self, base_url: &str, client: &Client) -> Result<()> {
        let base_url = Url::parse(base_url)?;
        let joined_url = base_url.join(&self.href)?;

        println!("now visited: {}", joined_url);

        let page = client.get(joined_url).send().await?.text().await?;
        let document = Html::parse_document(&page);
        let content_selector = Selector::parse("#content").unwrap();

        let content = match document.select(&content_selector).next() {
            Some(element) => element.text().collect::<Vec<_>>().join("\r\n"),
            None => "this chapter may have no content ".to_string(),
        };

        self.content = replace_html_entities(&content);
        Ok(())
    }
}

/// Chapters are considered equal when they point at the same link.
impl PartialEq for Chapter {
    fn eq(&self, other: &Self) -> bool {
        let (mine, theirs) = (&self.href, &other.href);
        mine == theirs
    }
}

/// Partial ordering simply defers to the total ordering defined by `Ord`.
impl PartialOrd for Chapter {
    fn partial_cmp(&self, other: &Self) -> Option<Ordering> {
        let ordering = Ord::cmp(self, other);
        Some(ordering)
    }
}

/// Chapters are ordered by their chapter number only.
impl Ord for Chapter {
    fn cmp(&self, other: &Self) -> Ordering {
        Ord::cmp(&self.number, &other.number)
    }
}


/// Makes sure `dir_path` exists, creating it (and any missing parents) when
/// it does not, and prints which of the two cases applied.
///
/// # Errors
///
/// Returns the I/O error reported while creating the directory.
fn check_and_create_directory(dir_path: &Path) -> io::Result<()> {
    // Guard clause: nothing to create when the path is already there.
    if dir_path.exists() {
        println!("Directory already exists: {:?}", dir_path);
        return Ok(());
    }

    println!("Directory does not exist. Creating directory: {:?}", dir_path);
    fs::create_dir_all(dir_path)
}

/// Picks a random pause of at least 500 ms and less than 2000 ms.
fn random_delay() -> Duration {
    // The upper bound of the range is exclusive.
    let pause_ms = rand::thread_rng().gen_range(500..2000);
    Duration::from_millis(pause_ms)
}

/// Decodes the handful of HTML entities that can remain in scraped text.
///
/// `&nbsp;` is removed, while `&amp;`, `&lt;` and `&gt;` become `&`, `<` and
/// `>`. The input is scanned once from left to right, so the output of one
/// replacement is never decoded a second time: `"&amp;lt;"` becomes `"&lt;"`,
/// not `"<"`. Any other `&` is copied through unchanged.
fn replace_html_entities(s: &str) -> String {
    // Extend this table to support more entities.
    const ENTITIES: [(&str, &str); 4] = [
        ("&nbsp;", ""),
        ("&amp;", "&"),
        ("&lt;", "<"),
        ("&gt;", ">"),
    ];

    let mut out = String::with_capacity(s.len());
    let mut rest = s;
    while let Some(pos) = rest.find('&') {
        out.push_str(&rest[..pos]);
        rest = &rest[pos..];
        match ENTITIES.iter().find(|(name, _)| rest.starts_with(*name)) {
            Some((name, replacement)) => {
                out.push_str(replacement);
                rest = &rest[name.len()..];
            }
            None => {
                // A lone ampersand: keep it and continue right after it.
                out.push('&');
                rest = &rest[1..];
            }
        }
    }
    out.push_str(rest);
    out
}
posted @ 2024-06-20 17:30  iTachiLEe  阅读(8)  评论(0编辑  收藏  举报