使用 Node.js 编写爬虫,爬取网站内容
要会的知识点
jQuery 的 $ 函数的第二个参数(context)
1.jQuery(selector, [context])
这种用法相当于 $(context).find(selector);当 context 本身已经是 jQuery 对象时,也可以直接写成 context.find(selector)。
依赖的库、需要用到的东西
// Dependencies and shared constants for the crawler module.

// Node.js built-in modules.
const url = require('url');
const util = require('util');

// bluebird: third-party Promise implementation that is compatible with the
// native Promise and adds extra helpers.
// NOTE: this binding shadows the global Promise inside this module.
const Promise = require('bluebird');

// request: toolkit for issuing HTTP requests from the server side.
const request = require('request');

// iconv-lite: character-set conversion, used to decode GBK content, which
// Node's built-in Buffer encodings do not cover.
const iconv = require('iconv-lite');

// cheerio: core jQuery functionality for working with the DOM on the server.
const cheerio = require('cheerio');

// lodash/sample: returns one random element from a collection.
const sample = require('lodash/sample');

// lodash/isEmpty: checks whether a value is an empty object, collection, map
// or set. Values that are not collections count as empty:
//   isEmpty(null)       => true
//   isEmpty(true)       => true
//   isEmpty(1)          => true
//   isEmpty([1, 2, 3])  => false
//   isEmpty({ a: 1 })   => false
const isEmpty = require('lodash/isEmpty');

// Project-local log output helper.
const Logger = require('../Logger');

// Project-local, environment-dependent configuration.
const configs = require('../configs');

const logger = new Logger('namespace');

// Browser User-Agent strings used to make requests look like they come from a
// real browser.
const USER_AGENT = [
  'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.25 Safari/537.36 Core/1.70.3741.400 QQBrowser/10.5.3863.400',
  'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.61 Safari/537.36',
  'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:72.0) Gecko/20100101 Firefox/72.0',
  'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36',
];
Logger.js 日志输出文件
/** Log output interface. */
const debug = require('debug');

/**
 * Thin wrapper around the `debug` package that writes to namespaces of the
 * form `app:<level>:<namespace>`.
 *
 * The per-level loggers are created once in the constructor instead of on
 * every call, so logging does not allocate new `debug` instances each time.
 */
class Logger {
  /**
   * @param {string} namespace - Suffix appended to `app:<level>:` for every
   *   message written through this instance.
   */
  constructor(namespace) {
    this.namespace = namespace;
    this.logger = debug('app');

    // Resolve console.log at call time so a later replacement of console.log
    // is still honoured, as it was when the binding was made per call.
    const writeToConsoleLog = (...args) => console.log(...args);

    // `error` keeps the output function inherited from the `debug` package.
    this.errorLogger = this.logger.extend('error').extend(namespace);

    // `info` and `debug` are routed through console.log instead.
    this.infoLogger = this.logger.extend('info').extend(namespace);
    this.infoLogger.log = writeToConsoleLog;

    this.debugLogger = this.logger.extend('debug').extend(namespace);
    this.debugLogger.log = writeToConsoleLog;
  }

  /**
   * Write to `app:error:<namespace>`.
   * @param {...*} arg - Format string and values, forwarded to `debug`.
   */
  error(...arg) {
    this.errorLogger(...arg);
  }

  /**
   * Write to `app:info:<namespace>` via console.log.
   * @param {...*} arg - Format string and values, forwarded to `debug`.
   */
  info(...arg) {
    this.infoLogger(...arg);
  }

  /**
   * Alias for {@link Logger#info}.
   * @param {...*} arg - Format string and values, forwarded to `debug`.
   */
  log(...arg) {
    this.info(...arg);
  }

  /**
   * Write to `app:debug:<namespace>` via console.log.
   * @param {...*} arg - Format string and values, forwarded to `debug`.
   */
  debug(...arg) {
    this.debugLogger(...arg);
  }
}

module.exports = Logger;
configs.js 配置服务器
// Environment-dependent configuration: exports the `production` settings when
// NODE_ENV is exactly 'production', otherwise the `development` settings.
const md5 = require('md5');
const get = require('lodash/get');

// get(object, path, defaultValue): falls back to 'development' when NODE_ENV
// is not set.
const env = get(process.env, 'NODE_ENV', 'development');

// NOTE(review): both environments embed a literal password, hashed with MD5,
// in source. Consider supplying credentials through the environment instead;
// the values are left unchanged here so existing callers keep working.

const development = {
  // Default user and password used to log in when calling the API.
  apiRoot: 'http://127.0.0.1/xxx',
  sessionUser: 'admin',
  sessionPassword: md5('test1234567'),
};

const production = {
  // Default user and password used to log in when calling the API.
  apiRoot: 'http://127.0.0.1/xxx',
  sessionUser: 'admin',
  sessionPassword: md5('test1234567'),
};

module.exports = env === 'production' ? production : development;