使用 Node.js 实现的小说爬虫

 1 //引入模块
 2 const http = require('http')
 3 const fs = require('fs')
 4 const cheerio = require('cheerio')
 5 const iconv = require('iconv-lite')
 6 //第一章url
 7 const url = 'http://www.81zw.com/book/8634/745331.html'
 8 //开始章节数
 9 let i = 1
10 //最大获取章节数
11 let num = 100
12 
/**
 * Crawler entry point: hands the given chapter URL to startRequest.
 *
 * Called once at the bottom of the script with the first chapter, and again
 * by startRequest for every following chapter.
 *
 * @param {string} url - URL of the chapter page to fetch.
 */
function main(url) {
    startRequest(url)
}
16 
/**
 * Download one chapter page, save it as JSON and schedule the next chapter.
 *
 * Reads and increments the module-level chapter counter `i`. The crawl stops
 * when `i` exceeds `num`, when the page has no `#content` element, when no
 * next-chapter link is found, or when the request fails.
 *
 * @param {string} url - Absolute URL of the chapter page. Its second-to-last
 *     path segment is reused as the book id when building the next URL.
 */
function startRequest(url) {
    // Log transport failures; an unhandled 'error' event would crash the process
    const onError = err => {
        console.error(`Request failed for ${url}: ${err.message}`)
    }

    http.get(url, res => {
        // Raw Buffer chunks of the response body
        const chunks = []
        res.on('error', onError)
        res.on('data', chunk => {
            chunks.push(chunk)
        })
        res.on('end', () => {
            // Decode the whole body as GBK in one go, after all chunks are joined
            const page = iconv.decode(Buffer.concat(chunks), 'gbk')
            // cheerio exposes a jQuery-like API over the parsed HTML
            const $ = cheerio.load(page, {decodeEntities: false})

            const content = $('#content').html()
            if (content == null) {
                // No chapter body on this page (error page, redirect, layout change)
                console.error(`No #content element found at ${url}; stopping`)
                return
            }
            // Paragraphs are separated by double <br>; strip whitespace and &nbsp; from each
            const paragraphs = content.split('<br><br>').map(part => trim(part))

            // Record that gets persisted for this chapter
            const obj = {
                id: i,
                err: 0,
                bookName: $('.con_top a').eq(2).text(),
                title: $('.bookname h1').text(),
                content: paragraphs
            }

            // Book id is the second-to-last path segment of the current URL
            const segments = url.split('/')
            const bookId = segments[segments.length - 2]
            // The third link in the bottom navigation is used as the next-chapter link
            const link = $('.bottem2 a').eq(2).attr('href')
            const nextLink = link ? `http://www.81zw.com/book/${bookId}/${link}` : null

            saveContent(obj, nextLink)
            i++
            if (nextLink === null) {
                console.log(`No next-chapter link found at ${url}; stopping`)
                return
            }
            console.log(`第${i}章：${nextLink}`)
            if (i <= num) {
                // Short pause between requests before fetching the next chapter
                setTimeout(() => {
                    main(nextLink)
                }, 100)
            }
        })
    }).on('error', onError)
}
66 
/**
 * Write one chapter to `./data/<bookName>/chapter<id>.json`.
 *
 * The target directory (including the parent `data` directory) is created if
 * it does not exist. The file is written asynchronously; a write error is
 * rethrown from the callback.
 *
 * @param {{id: number, err: number, bookName: string, title: string, content: string[]}} obj
 *     Chapter record; `id` names the file and `bookName` names the directory.
 * @param {?string} nextLink - URL of the next chapter. Currently unused; kept
 *     so existing callers keep working.
 */
function saveContent(obj, nextLink) {
    console.log(`${obj.id}--${obj.title}`)
    const dir = `./data/${obj.bookName}`
    // recursive: true also creates ./data and does not fail if the directory already exists
    fs.mkdirSync(dir, {recursive: true})
    // Name the file after the record's own id instead of the global counter
    fs.writeFile(`${dir}/chapter${obj.id}.json`, JSON.stringify(obj), 'utf-8', err => {
        if (err) throw err
    })
}
78 
/**
 * Remove every `&nbsp;` entity from a string and strip leading/trailing whitespace.
 *
 * The entities are removed first so that whitespace sitting next to a leading
 * or trailing `&nbsp;` is stripped as well.
 *
 * @param {string} str - HTML fragment to clean.
 * @returns {string} The cleaned text.
 */
function trim(str) {
    return str.replace(/&nbsp;/g, '').trim()
}
82 
// Start crawling from the configured first chapter
main(url)
生成文件
posted @ 2017-04-23 23:26 tgxh 阅读(1895) 评论(0) 编辑收藏举报
刷新页面返回顶部
tgxh的博客

I Love It

使用 Node.js 实现的小说爬虫

公告