我也来写个小爬虫 ^_^
今天下班抽了点时间看了下印象笔记,整理了一个礼拜的 Node.js API 笔记。然后去慕课网看了 Scott 老师讲的 node 系列视频教程,于是自己写了一个小小的爬虫,爬的是自己写的博客章节。里面的一些 es6 语法和 api 我就不一一细说,大家可以去看文档:http://nodeapi.ucdok.com/#/api/ 。好了,话不多说,直接上代码:
'use strict';
{
  const http = require(`http`);
  const cheerio = require(`cheerio`);
  const fs = require(`fs`);

  // Blog index page to scrape.
  let url = `http://www.cnblogs.com/tween`;

  // Parse the fetched HTML into an array of day entries; each entry has a
  // numbered date line (`time`) and the list of posts for that day (`blog`).
  let getContent = content => {
    let $ = cheerio.load(content);
    let blogs = $(`.day`);
    let arr = [];
    blogs.each((index, item) => {
      let _this = $(item);
      let time = _this.find(`.dayTitle`).text();
      let indexBlog = [];
      _this.find(`.postTitle`).each((i, el) => {
        let title = $(el).text().trim();
        let list = _this.find(`.postDesc`).eq(i).text();
        // ".postDesc" text carries two parenthesized numbers (read count and
        // comment count). match() returns null when neither is present, so
        // guard with a fallback instead of crashing on [0].
        let counts = list.match(/\(\d+\)/g) || [`(0)`, `(0)`];
        let read = counts[0];
        let comment = counts[1] || `(0)`;
        indexBlog[i] = {
          title: `\t${title}\n`,
          read: `\t阅读:${read} 评论:${comment}\n`,
        };
      });
      arr[index] = {
        time: `${index + 1} 、${time.trim()}\n`,
        blog: indexBlog,
      };
    });
    return arr;
  };

  // Flatten the parsed structure into plain text and write it to blog.txt
  // in the current working directory.
  let creatTxt = content => {
    let txt = ``;
    for (let day of content) {
      txt += day.time;
      for (let post of day.blog) {
        for (let name in post) {
          txt += post[name];
        }
        txt += `\n`;
      }
    }
    fs.writeFile(`blog.txt`, txt, 'utf-8', (err) => {
      err ? console.log(err) : console.log(`写入成功`);
    });
  };

  http.get(url, (res) => {
    // Decode as UTF-8 up front: concatenating raw Buffer chunks to a string
    // corrupts any multi-byte character split across a chunk boundary.
    res.setEncoding(`utf-8`);
    let content = ``;
    res.on(`data`, (data) => {
      content += data;
    }).on(`end`, () => {
      let html = getContent(content);
      creatTxt(html);
    });
  }).on(`error`, () => console.log(`获取数据失败`));
}
运行后会在同目录下创建一个 blog.txt 文件,里面的内容就是爬取到的数据。
若需转载,请注明出处,谢谢合作!