浏览器小说爬虫

在外网偷偷爬点小说资源

import puppeteer from 'puppeteer-core'
import fs from 'fs'

(async () => {
    // Scrapes a novel's metadata and every chapter from the target site,
    // then dumps the result to "<novel name>.json" in the working directory.
    //
    // Launches a *visible* Chrome reusing an existing user profile so any
    // logged-in session / cookies in that profile are available to the scrape.
    const browser = await puppeteer.launch({
        headless: false,
        userDataDir: '/Users/caoke/chromedefault',
        executablePath: '/Applications/Google Chrome.app/Contents/MacOS/Google Chrome',
    });

    try {
        const page = await browser.newPage();
        // setViewport returns a Promise — it must be awaited so the viewport
        // is applied before the first navigation (original omitted the await).
        await page.setViewport({
            width: 1024,
            height: 768,
        });

        const data = {};
        const url = 'https://www.xxx.com/novel/intro?id=1018852882323214336';
        await page.goto(url);
        data.url = url;

        // Wait for the intro section to finish rendering before scraping.
        await page.waitForSelector('#root > div.intro_box > div.content_box > div.novel_box > ul.intro > li:nth-child(3) > div');

        data.name = await page.$eval('#root > div.intro_box > div.content_box > div.novel_box > ul.intro > li:nth-child(1) > div.info > div.title > h1', (el) => el.innerText);
        data.author = await page.$eval('#root > div.intro_box > div.content_box > div.novel_box > ul.intro > li:nth-child(1) > div.info > div.author', (el) => el.innerText);
        // Tag list: one <a> per tag inside the tag container.
        data.tags = await page.$eval('#root > div.intro_box > div.content_box > div.novel_box > ul.intro > li:nth-child(1) > div.info > div.tag', (el) =>
            Array.from(el.querySelectorAll('a'), (node) => node.innerText)
        );
        data.intro = await page.$eval('#root > div.intro_box > div.content_box > div.novel_box > ul.intro > li:nth-child(3) > div > h2', (el) => el.innerText);
        console.log(data);

        // Collect every chapter link (href + title) from the catalog list.
        data.list = [];
        const nList = await page.$eval('#root > div.intro_box > div.content_box > div.novel_box > ul.catalog', (el) =>
            Array.from(el.querySelectorAll('a'), (node) => ({
                href: node.href,
                title: node.innerText,
            }))
        );

        // Visit chapters sequentially in the same tab; parallel navigation on
        // one page object is not possible, so for...of (not Promise.all).
        for (const { href } of nList) {
            await page.goto(href);
            await page.waitForSelector('#content > div.chapter > div.title_box > span');

            const item = {};
            item.title = await page.$eval('#content > div.chapter > div.title_box > span', (el) => el.innerText);
            item.info = await page.$eval('.article', (el) => el.innerText);
            data.list.push(item);
            console.log(data);
        }

        // NOTE(review): data.name comes straight from the page and may contain
        // characters invalid in filenames — consider sanitizing if it breaks.
        fs.writeFileSync(`${data.name}.json`, JSON.stringify(data, null, 2));
    } finally {
        // Always release the browser, even when scraping throws mid-run;
        // the original left it open (commented-out close) and leaked it.
        await browser.close();
    }
})();

 

posted @ 2024-08-08 21:58  无工时代  阅读(42)  评论(0)    收藏  举报