Node.js 简单爬虫示例:代码是跟着慕课网的教程写的,因为网站结构有一点点改动,所以做了相应调整,贴上来方便以后复习。
var http = require('http');
var cheerio = require('cheerio');

// Course page whose chapter/video outline is scraped.
var url = 'http://www.imooc.com/learn/348';

/**
 * Collapse every run of whitespace into a single space and strip both ends.
 * Text pulled out of the page keeps the markup's newlines and indentation,
 * which would otherwise end up in the printed titles.
 *
 * @param {string} text - Raw text extracted from an element.
 * @returns {string} The text with whitespace normalized.
 */
function cleanText(text) {
  return text.replace(/\s+/g, ' ').trim();
}

/**
 * Extract the chapter/video outline from the course page markup.
 *
 * Result shape:
 *   [{
 *     chapterTitle: '',
 *     videos: [{ title: '', id: '' }]
 *   }]
 *
 * @param {string} html - HTML of the course page.
 * @returns {Array} One entry per `.chapter` element, in document order.
 */
function filterChapters(html) {
  var $ = cheerio.load(html);
  var courseData = [];

  $('.chapter').each(function () {
    var chapter = $(this);
    var chapterData = {
      chapterTitle: cleanText(chapter.find('h3').text()),
      videos: []
    };

    chapter.find('.video').children('li').each(function () {
      var video = $(this).find('.J-media-item');
      var href = video.attr('href');

      // An <li> without a media link has no href; skip it instead of
      // crashing on `undefined.split(...)`.
      if (!href) {
        return;
      }

      chapterData.videos.push({
        title: cleanText(video.text()),
        // The id is whatever follows "/video/" in the link.
        id: href.split('/video/')[1]
      });
    });

    courseData.push(chapterData);
  });

  return courseData;
}

/**
 * Print every chapter title followed by its videos as "【id】title".
 *
 * @param {Array} courseData - Outline in the shape returned by filterChapters.
 */
function printCourseInfo(courseData) {
  courseData.forEach(function (item) {
    console.log(item.chapterTitle + '\n');

    item.videos.forEach(function (video) {
      console.log('【' + video.id + '】' + video.title + '\n');
    });
  });
}

// Download the course page, then parse and print its outline.
http.get(url, function (res) {
  if (res.statusCode !== 200) {
    // A redirect or error page is not the course page; report it rather
    // than parsing it, and drain the response so the socket is released.
    console.log('获取课程数据出错' + ' (HTTP ' + res.statusCode + ')');
    res.resume();
    return;
  }

  // Decode as UTF-8 at the stream level so a multi-byte character that is
  // split across two chunks is not corrupted by per-chunk conversion.
  res.setEncoding('utf8');

  var html = '';

  res.on('data', function (chunk) {
    html += chunk;
  });

  res.on('end', function () {
    printCourseInfo(filterChapters(html));
  });
}).on('error', function (err) {
  console.log('获取课程数据出错' + ': ' + err.message);
});
效果
数据还有一点没整理好,得日后再弄。
就是把不想要的内容也取回来了,现在还不懂怎么把数据清洗干净,先记下来。