4、nodejs爬虫JSDOM(一)
1. 类库介绍
1. Web 项目,基于 Express 4 和 EJS 模板
2. 通过request抓取网页
3. 通过jQuery, jsdom, htmlparser提取网页内容
安装依赖包
npm install jsdom
npm install jquery
npm install xmlhttprequest
npm install request
npm install htmlparser
增加 public/lib/myUtil.js 文件(控制器通过 '../public/lib/myUtil.js' 引用它)
// Small HTTP helper: wraps the `request` package behind a single `get` method
// and exports one shared instance.
const http = require('http'); // not referenced by the code in this file
const request = require('request');

const MyUtil = function () { };

/**
 * Fetch `url` with an HTTP GET and report the outcome to `callback`.
 *
 * The callback is always invoked exactly once, so a caller that answers an
 * HTTP request from inside it can never be left waiting:
 *   - 200 response:     callback(body, 200)
 *   - non-200 response: callback(body, statusCode, error)
 *   - transport error:  callback(null, 0, error)   (no response was received)
 *
 * @param {string} url - Address to fetch.
 * @param {function(?string, number, Error=): void} callback - Receives the
 *   response body, the HTTP status code (0 when there was no response) and,
 *   only on failure, an Error describing what went wrong.
 */
MyUtil.prototype.get = function (url, callback) {
  request(url, function (error, response, body) {
    if (!error && response.statusCode === 200) {
      callback(body, response.statusCode);
      return;
    }

    // `response` is undefined when the request itself failed (DNS, refused
    // connection, ...), so there is no status code to report in that case.
    const statusCode = response ? response.statusCode : 0;
    const failure = error || new Error('Unexpected HTTP status ' + statusCode + ' for ' + url);
    callback(error ? null : body, statusCode, failure);
  });
};

module.exports = new MyUtil();
修改控制器 routes/pachong.js
const express = require('express');
const jsdom = require('jsdom');
// jQuery needs a window object; give it the one from an empty jsdom document.
const $ = require('jquery')(new jsdom.JSDOM().window);
const myUtil = require('../public/lib/myUtil.js');

const router = express.Router();

// Page scraped by this demo route.
const url = "http://movie.douban.com/subject/11529526";

/**
 * GET / — fetch the page at `url`, log the movie name, the director and the
 * list of names found in `span[class="name"] a`, then send the fetched HTML
 * back to the client unchanged.
 *
 * Responds with 502 when the fetch result is not a 200 response carrying a
 * string body. Exceptions raised while parsing are forwarded to Express via
 * `next(err)`, because they happen inside an asynchronous callback where
 * Express cannot catch them itself.
 */
router.get("/", function (req, res, next) {
  myUtil.get(url, function (content, status) {
    console.log("status:=" + status);

    // Defensive guard: only scrape when there is actually HTML to parse.
    if (status !== 200 || typeof content !== 'string') {
      res.status(502).send('Failed to fetch ' + url);
      return;
    }

    try {
      // Parse the HTML once and reuse the result for every query.
      const page = $(content);

      const movie = {};
      movie.name = page.find('span[property="v:itemreviewed"]').text();
      movie.director = page.find('#info span:nth-child(1) a').text();
      console.log(movie);

      // Text of every link inside a span whose class attribute is exactly "name".
      const names = page
        .find('span[class="name"] a')
        .map(function (index, anchor) {
          return $(anchor).text();
        })
        .get();

      console.log(">>>>>>>>>>>");
      console.log(names);

      res.send(content);
    } catch (err) {
      next(err);
    }
  });
});

module.exports = router;
修改 app.js,注册路由
// Load the crawler router exported by routes/pachong.js.
var pachongRouter = require('./routes/pachong')
// Mount it so that requests under /pachong are handled by that router.
app.use('/pachong', pachongRouter);
基本示例