07慕课网《进击Node.js基础(一)》HTTP小爬虫
获取HTML页面
// Fetch the course page over HTTP and print its raw HTML to stdout.
var http = require('http');
var url = 'http://www.imooc.com/learn/348';

http.get(url, function (res) {
  var html = '';

  // Have the stream hand us UTF-8 strings instead of Buffers. Appending raw
  // Buffers with `+=` decodes each chunk separately, which corrupts any
  // multi-byte character that straddles a chunk boundary.
  res.setEncoding('utf8');

  // Accumulate the body as it arrives.
  res.on('data', function (data) {
    html += data;
  });

  // The whole body has been received; dump it.
  res.on('end', function () {
    console.log(html);
  });
}).on('error', function () {
  // The event name must be exactly 'error'; a misspelled name never fires and
  // a failed request would then surface as an unhandled 'error' event.
  console.log('出错');
});
对HTML页面处理
添加cheerio模块
借助 cheerio,可以在 Node.js 中像使用 jQuery 一样通过选择器查询和处理 HTML
# Install cheerio into the current directory
npm install cheerio
处理HTML,提取章节和视频信息
// Crawl a course page, extract its chapter/video outline and print it.
var http = require('http');
var cheerio = require('cheerio');
var url = 'http://www.imooc.com/learn/348';

/**
 * Extract the chapter outline from the course page HTML.
 *
 * The result has the shape:
 *   [{
 *     chapterTitle: '',
 *     videos: [{ title: '', id: '' }]
 *   }]
 *
 * @param {string} html - Markup of the course page.
 * @returns {Array} One entry per `.chapter` element found in the markup.
 */
function filterChapters(html) {
  var $ = cheerio.load(html);
  var chapters = $('.chapter');
  var courseData = [];

  chapters.each(function () {
    var chapter = $(this);
    var chapterTitle = chapter.find('h3').text();
    var videos = chapter.find('.video').children('li');
    var chapterData = {
      chapterTitle: chapterTitle.trim(),
      videos: []
    };

    videos.each(function () {
      var video = $(this).find('a');
      var videoTitle = video.text();
      // attr() yields undefined when the <li> has no link with an href;
      // fall back to '' so the split below cannot throw.
      var href = video.attr('href') || '';
      // The id is whatever follows 'video/' in the link; it stays undefined
      // when the link does not contain that segment.
      var id = href.split('video/')[1];

      chapterData.videos.push({
        title: videoTitle.trim(),
        id: id
      });
    });

    courseData.push(chapterData);
  });

  return courseData;
}

/**
 * Print every chapter title followed by its videos as "【id】title" lines.
 *
 * @param {Array} courseData - Outline as returned by filterChapters.
 */
function printCourseInfo(courseData) {
  courseData.forEach(function (item) {
    var chapterTitle = item.chapterTitle;
    console.log(chapterTitle + '\n');

    item.videos.forEach(function (video) {
      console.log(' 【' + video.id + '】' + video.title + '\n');
    });
  });
}

http.get(url, function (res) {
  var html = '';

  // Receive UTF-8 strings rather than Buffers so that a multi-byte character
  // split across two chunks is not corrupted by `html += data`.
  res.setEncoding('utf8');

  res.on('data', function (data) {
    html += data;
  });

  res.on('end', function () {
    var courseData = filterChapters(html);
    printCourseInfo(courseData);
  });
}).on('error', function () {
  // Must be the 'error' event; a misspelled name is never emitted, leaving
  // request failures unhandled.
  console.log('出错');
});
使用promise重写:
10慕课网《进击Node.js基础(一)》初识promise