4、nodejs爬虫JSDOM(一)

1. 类库介绍 

1. web项目,基于express4 ejs模板
2. 通过request抓取网页
3. 通过jQuery, jsdom, htmlparser提取网页内容

 

安装依赖包

 npm install jsdom

 

  npm install jquery
  npm install xmlhttprequest
  npm install request
  npm install htmlparser

 

增加 public/lib/myUtil.js 文件(封装 request,用于抓取网页内容)
/**
 * Minimal page-fetching helper. A single shared instance is exported below.
 */
var MyUtil = function () {
};
// NOTE: `http` is not referenced anywhere in this snippet; the require is kept as-is.
var http = require('http');
var request = require('request');

/**
 * Download `url` with the `request` library and report the outcome.
 *
 * The callback is invoked on every completion path (success, non-200
 * response, or transport error) so that callers are never left waiting
 * for a reply that will not come.
 *
 * @param {string} url - Address of the page to download.
 * @param {function} callback - Called as `callback(body, statusCode, error)`:
 *   - success (HTTP 200): `error` is `null`;
 *   - non-200 response: `body` and `statusCode` are passed through and
 *     `error` is an `Error` describing the unexpected status;
 *   - transport failure: `body` and `statusCode` are `undefined` and
 *     `error` is the error reported by `request`.
 */
MyUtil.prototype.get = function (url, callback) {
  request(url, function (error, response, body) {
    if (error) {
      // No response object is available when the request itself failed.
      callback(undefined, undefined, error);
      return;
    }
    if (response.statusCode !== 200) {
      callback(
        body,
        response.statusCode,
        new Error('Unexpected status code ' + response.statusCode + ' for ' + url)
      );
      return;
    }
    callback(body, response.statusCode, null);
  });
};
module.exports = new MyUtil();

 

修改控制器 routes/pachong.js

var express = require('express');
var router = express.Router();

var jsdom = require('jsdom');
// jQuery needs a window object to operate on; jsdom supplies one outside a browser.
var $ = require('jquery')(new jsdom.JSDOM().window);

var myUtil = require('../public/lib/myUtil.js');

// Page scraped by this demo route.
var url = "http://movie.douban.com/subject/11529526";


/**
 * GET / — download the page at `url`, log a few fields extracted from it
 * with jQuery selectors, and send the raw downloaded content back to the client.
 *
 * If the fetch helper supplies an error as a third callback argument, or
 * supplies no content at all, the failure is forwarded to Express through
 * `next` instead of being parsed.
 */
router.get("/", function (req, res, next) {
  myUtil.get(url, function (content, status, err) {
    if (err || content == null) {
      next(err || new Error("No content received from " + url));
      return;
    }

    console.log("status:=" + status);

    // Parse the HTML once and reuse the result for every query below.
    var $page = $(content);

    var movie = {};
    movie.name = $page.find('span[property="v:itemreviewed"]').text();
    movie.director = $page.find('#info span:nth-child(1) a').text();
    console.log(movie);

    // Collect the text of every matched link inside span[class="name"].
    var names = $page.find('span[class="name"] a').toArray().map(function (anchor) {
      return anchor.text;
    });

    console.log(">>>>>>>>>>>");
    console.log(names);

    res.send(content);
  });
});

module.exports = router;

 

在 app.js 中注册路由

// Load the crawler routes and mount them under the /pachong path.
var crawlerRoutes = require('./routes/pachong');

app.use('/pachong', crawlerRoutes);

 

基本示例 

 

posted @ 2020-11-18 10:20  2020,hello  阅读(465)  评论(0)  编辑  收藏  举报