用node爬取网页上的内容

如何用 Node.js 简单地爬取网页上的内容:

1.安装express以及生成器

express官网:http://www.expressjs.com.cn/

  npm install express --save

  npm install express-generator -g

2.用生成器创建新Express应用,进入项目并安装依赖包

  express myapp

  cd myapp

  npm install

3.安装superagent

superagent官网:http://visionmedia.github.io/superagent/

npm install superagent

4.安装cheerio

cheerio官网:https://cheerio.js.org/

npm install cheerio

5.在routes文件夹下新建路由文件news.js

const express = require('express');
const cheerio = require('cheerio');
const superagent = require('superagent');

const router = express.Router();

/**
 * GET / — download the donews.com home page and respond with the links
 * matched by the selector below.
 *
 * The response body is an array of `{ title, href }` objects, one per matched
 * anchor: `title` is the anchor's text and `href` is its `href` attribute.
 * A failed download is passed to Express error handling through `next(err)`.
 */
router.get('/', (req, res, next) => {
  superagent.get('http://www.donews.com/').end((err, sres) => {
    if (err) {
      next(err);
      return;
    }

    // sres.text holds the raw HTML of the fetched page; cheerio exposes it
    // through a jQuery-like `$` function.
    const $ = cheerio.load(sres.text);

    const items = $('div.block h3.block a')
      .toArray()
      .map((element) => {
        const $link = $(element);
        return {
          title: $link.text(),
          href: $link.attr('href')
        };
      });

    res.send(items);
  });
});

module.exports = router;

superagent.get('抓取网页的地址')

网页的 html 内容存储在 sres.text 里面

cheerio.load 加载得到的html内容并赋给变量 $

后面选取所需内容的语法和 jQuery 选择器基本一致:先选中需要的元素,再对它们进行遍历

然后返回遍历的内容

6.在app.js中引入路由文件

// Third-party and core modules used by the application.
var createError = require('http-errors');
var express = require('express');
var path = require('path');
var cookieParser = require('cookie-parser');
var logger = require('morgan');

// Router defined in routes/news.js; mounted under /news further down.
var newsRouter = require('./routes/news');

var app = express();// Create the Express application instance

/**
 * Express middleware that prints the fixed marker "LOGGED" for every request
 * it sees and then hands control to the next middleware.
 *
 * @param {object} req - Incoming request (unused).
 * @param {object} res - Outgoing response (unused).
 * @param {Function} next - Callback that continues the middleware chain.
 */
function myLogger(req, res, next) {
  console.log('LOGGED');
  next();
}

/**
 * Express middleware that stamps the request with the current time.
 *
 * Stores `Date.now()` (milliseconds since the Unix epoch) on
 * `req.requestTime`, prints that value, then continues the middleware chain.
 *
 * @param {object} req - Incoming request; receives the `requestTime` property.
 * @param {object} res - Outgoing response (unused).
 * @param {Function} next - Callback that continues the middleware chain.
 */
function requestTime(req, res, next) {
  const now = Date.now();
  req.requestTime = now;
  console.log(now);
  next();
}

// view engine setup
app.set('views', path.join(__dirname, 'views'));
app.set('view engine', 'pug');

app.use(logger('dev'));
app.use(express.json());
app.use(express.urlencoded({ extended: false }));
app.use(cookieParser());
// Serve the images, CSS and JavaScript files under ./public.
// path.join(__dirname, ...) turns the directory into an absolute path.
app.use(express.static(path.join(__dirname, 'public')));

app.use(myLogger);
app.use(requestTime);

// Cross-origin (CORS) response headers.
// This middleware must be registered BEFORE the routes: a route handler that
// sends its response never calls next(), so anything registered after it is
// not run for that request and its headers would never reach the client.
// It is mounted without a path so that it applies to every request.
// Content-Type is deliberately not forced here: each handler sets its own
// (JSON for /news, HTML for the rendered error page).
app.use(function (req, res, next) {
  res.header("Access-Control-Allow-Origin", "*");
  // 'yourHeaderFeild' is a placeholder for an application-specific header name.
  res.header('Access-Control-Allow-Headers', 'Content-Type, Content-Length, Authorization, Accept, X-Requested-With , yourHeaderFeild');
  res.header("Access-Control-Allow-Methods", "PUT,POST,GET,DELETE,OPTIONS");
  res.header("X-Powered-By", ' 3.2.1');
  next();
});

app.use('/news', newsRouter);

// catch 404 and forward to error handler
app.use(function(req, res, next) {
  next(createError(404));
});

// error handler
app.use(function(err, req, res, next) {
  // set locals, only providing error in development
  res.locals.message = err.message;
  res.locals.error = req.app.get('env') === 'development' ? err : {};

  // render the error page
  res.status(err.status || 500);
  res.render('error');
});

module.exports = app;

引入路由的代码:

var newsRouter = require('./routes/news');

app.use('/news', newsRouter);

7.运行

npm start

启动后在浏览器中访问 http://localhost:3000/news(express-generator 生成的项目默认监听 3000 端口),即可看到爬取的数据

posted @ 2019-11-07 13:55  Li_pk  阅读(738)  评论(0)  编辑  收藏  举报