随笔- 137 文章- 0 评论- 34 阅读- 97万

使用node.js如何爬取网站数据

　　数据库又不会弄，只能扒扒别人的数据了。

　　搭建环境：

　　（1）、创建一个文件夹，进入并初始化一个package.json文件。

　　 npm init -y

　　（2）、安装相关依赖：

　　 npm install --save koa
　　 npm install --save cheerio
　　 （cheerio 后面会用到，用于解析抓取到的页面：它是为服务器特别定制的，快速、灵活、精简的jQuery核心实现）

　　现在来一个简单的demo热热身。直接上代码（文件名：demo2.js）：

// Minimal demo: download one page over HTTP and print its raw HTML.
var http = require('http') // Node.js built-in module for HTTP servers and clients
var url = 'http://www.m4yy.com/type/2.html' // target page to download

http.get(url, function (res) { // send the GET request
  // Decode at the stream level: without this every chunk is converted to a
  // string on its own, so a multi-byte character that straddles two chunks
  // would be corrupted.
  res.setEncoding('utf8')
  var html = ''
  res.on('data', function (chunk) {
    html += chunk // accumulate the decoded body
  })
  res.on('end', function () {
    console.log(html)
  })
}).on('error', function (err) {
  // Report the underlying reason instead of discarding the error object.
  console.error('获取资源出错！', err.message)
})

　　执行node demo2.js 得到结果如下：

　　很神奇有木有。然而这不是我们想要的，我们想要的是获取页面里面某一部分的数据。

　　这里以没事影院的电视剧页面为例。这时候cheerio就派上用场了，前面已经安装过，这里就不再赘述，它的用法其实跟jquery是很相似的。参考 cheerio（百度百科的解释）

　　在刚刚的js文件中引入cheerio模块，然后加载所需要的html内容。

　　var $ = cheerio.load(html) // load the HTML that will be queried

　　为了方便使用，这里封装一个函数：

/**
 * Extract one record per `.movie-item` element from a listing page.
 *
 * Items whose direct child link has no `href`, or whose `href` has nothing
 * after `show/`, are skipped: the detail URL cannot be built for them, and
 * calling `.split()` on a missing `href` would throw.
 *
 * @param {string} html - HTML source of the listing page.
 * @returns {Array<Object>} Records with the keys `chapterTitle`, `tvUrl`,
 *   `imgUrl`, `updateStatus`, `type` and `url`.
 */
function filterChapters(html) {
  const $ = cheerio.load(html) // parse once, then query with jQuery-like selectors
  const courseData = [] // collected records

  // cheerio calls the callback as (index, element).
  $('.movie-item').each(function (index, element) {
    const chapter = $(element)
    const link = chapter.children('a')
    const href = link.attr('href')
    // Part of the link after "show/"; undefined when the marker is absent.
    const tvUrl = href ? href.split('show/')[1] : undefined
    if (!tvUrl) {
      return // skips only this item; returning `false` would stop the loop
    }

    courseData.push({
      chapterTitle: link.attr('title'),
      tvUrl: tvUrl,
      imgUrl: chapter.find('img').attr('src'),
      updateStatus: chapter.find('.hdtag').text(),
      type: chapter.find('.otherinfo a').text(),
      url: `http://www.m4yy.com/show/${tvUrl}`
    })
  })

  return courseData
}

　　现在将上面的demo2.js文件稍作修改。完整代码如下：

// Download a listing page and print the records extracted by filterChapters.
var http = require('http') // Node.js built-in module for HTTP servers and clients
var url = 'http://www.m4yy.com/type/2-3.html' // listing page to scrape
var cheerio = require('cheerio') // server-side implementation of the core jQuery API, used to query the page

http.get(url, function (res) { // send the GET request
  // Decode at the stream level: without this every chunk is converted to a
  // string on its own, so a multi-byte character that straddles two chunks
  // would be corrupted.
  res.setEncoding('utf8')
  var html = ''
  res.on('data', function (chunk) {
    html += chunk // accumulate the decoded body
  })
  res.on('end', function () {
    var courseData = filterChapters(html)
    console.log('courseData', courseData)
  })
}).on('error', function (err) {
  // Report the underlying reason instead of discarding the error object.
  console.error('获取资源出错！', err.message)
})

/**
 * Extract one record per `.movie-item` element from a listing page.
 *
 * Items whose direct child link has no `href`, or whose `href` has nothing
 * after `show/`, are skipped: the detail URL cannot be built for them, and
 * calling `.split()` on a missing `href` would throw.
 *
 * @param {string} html - HTML source of the listing page.
 * @returns {Array<Object>} Records with the keys `chapterTitle`, `tvUrl`,
 *   `imgUrl`, `updateStatus`, `type` and `url`.
 */
function filterChapters(html) {
  const $ = cheerio.load(html) // parse once, then query with jQuery-like selectors
  const courseData = [] // collected records

  // cheerio calls the callback as (index, element).
  $('.movie-item').each(function (index, element) {
    const chapter = $(element)
    const link = chapter.children('a')
    const href = link.attr('href')
    // Part of the link after "show/"; undefined when the marker is absent.
    const tvUrl = href ? href.split('show/')[1] : undefined
    if (!tvUrl) {
      return // skips only this item; returning `false` would stop the loop
    }

    courseData.push({
      chapterTitle: link.attr('title'),
      tvUrl: tvUrl,
      imgUrl: chapter.find('img').attr('src'),
      updateStatus: chapter.find('.hdtag').text(),
      type: chapter.find('.otherinfo a').text(),
      url: `http://www.m4yy.com/show/${tvUrl}`
    })
  })

  return courseData
}

　　再次执行node demo2.js 此时结果如下：

　　光这样还不够，我们想把它存在一个json文件中，下面就新建一个tvlist.json文件。将上面获取的内容添加到json文件中，这里就涉及到文件的写操作了。

　　需要用到模块fs，因此在js文件中引入fs模块。完整代码如下：　

// Download a listing page, extract its records and save them as a JSON file.
var http = require('http') // Node.js built-in module for HTTP servers and clients
var url = 'http://www.m4yy.com/type/2.html' // listing page to scrape
var cheerio = require('cheerio') // server-side implementation of the core jQuery API, used to query the page
var fs = require("fs") // Node.js built-in file system module

http.get(url, function (res) { // send the GET request
  // Decode at the stream level: without this every chunk is converted to a
  // string on its own, so a multi-byte character that straddles two chunks
  // would be corrupted.
  res.setEncoding('utf8')
  var html = ''
  res.on('data', function (chunk) {
    html += chunk // accumulate the decoded body
  })
  res.on('end', function () {
    var courseData = filterChapters(html)
    // Serialize the whole array in one call. fs.writeFile needs a string or
    // buffer, and an array of per-item JSON strings would not form a valid
    // JSON document even if it were coerced to text.
    var content = JSON.stringify(courseData, null, 2)

    fs.writeFile('./tvlist.json', content, function (err) { // path, data, completion callback
      if (err) throw new Error('写文件失败' + err)
      console.log("成功写入文件")
    })
  })
}).on('error', function (err) {
  // Report the underlying reason instead of discarding the error object.
  console.error('获取资源出错！', err.message)
})

/**
 * Extract one record per `.movie-item` element from a listing page.
 *
 * Items whose direct child link has no `href`, or whose `href` has nothing
 * after `show/`, are skipped: the detail URL cannot be built for them, and
 * calling `.split()` on a missing `href` would throw.
 *
 * @param {string} html - HTML source of the listing page.
 * @returns {Array<Object>} Records with the keys `chapterTitle`, `tvUrl`,
 *   `imgUrl`, `updateStatus`, `type` and `url`.
 */
function filterChapters(html) {
  const $ = cheerio.load(html) // parse once, then query with jQuery-like selectors
  const courseData = [] // collected records

  // cheerio calls the callback as (index, element).
  $('.movie-item').each(function (index, element) {
    const chapter = $(element)
    const link = chapter.children('a')
    const href = link.attr('href')
    // Part of the link after "show/"; undefined when the marker is absent.
    const tvUrl = href ? href.split('show/')[1] : undefined
    if (!tvUrl) {
      return // skips only this item; returning `false` would stop the loop
    }

    courseData.push({
      chapterTitle: link.attr('title'),
      tvUrl: tvUrl,
      imgUrl: chapter.find('img').attr('src'),
      updateStatus: chapter.find('.hdtag').text(),
      type: chapter.find('.otherinfo a').text(),
      url: `http://www.m4yy.com/show/${tvUrl}`
    })
  })

  return courseData
}