Node.js 爬虫进阶版
手写了一个方便爬虫的小库:
const url = require('url') const glib = require('zlib') //默认头部 const _default_headers = { 'Accept-Encoding': 'gzip, deflate, br', 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.186 Safari/537.36' } //options(url,method,header)--http头部信息 isDebug--是否开启调试状态 module.exports = function(options, isDebug) { if(typeof options === "string") { options = { url: options, method: 'GET', headers: {} } } else { options = options || {} options.method = options.method || 'GET' options.headers = options.headers || {} } options.headers = Object.assign(_default_headers, options.headers) function debug(msg) { if(isDebug) { console.log(msg) } } return new Promise((resolve, reject) => { req(options) function req(options) { //判断是http还是https let urlObj = url.parse(options.url) let mod = null port = 0 if(urlObj.protocol == 'https:') { mod = require('https') port = 443 } else { mod = require('http') port = 80 } let _req_options = { hostname: urlObj.hostname, port, path: urlObj.path, method: options.method, headers: options.headers } //开始模拟,爬取信息 let req_obj = mod.request(_req_options, (res) => { if(res.statusCode!==200) { //如果是重定向则重新在请求 if(res.statusCode == 301 || res.statusCode === 302) { options.url = res.headers.location debug('重定向: '+res.headers.location) req(options) } else { reject(res.statusCode) } } else { //statusCode是200时接受data buffer let data = [] res.on('data', buffer => { data.push(buffer) }) res.on('end', () =>{ let buffer = Buffer.concat(data) //判断是否传输有误 if (res.headers['content-length'] != buffer.length) { debug('收到数据有误,正在重新获取') req(options) } //判断是否有用gzip else if (res.headers['content-encoding'] && res.headers['content-encoding'].includes('gzip')) { buffer = glib.gunzip(buffer, (err,data) => { debug('gzip解压完成并成功返回') resolve(data) }) } else { debug('成功返回') resolve(buffer) } }) } }) req_obj.on('error', err => { debug('爬虫失败') reject(err) }) req_obj.end() } }) }
用 require 引入后,传入 url 字符串或 options 对象,即可得到一个 Promise,它 resolve 的值就是抓取到的响应内容(Buffer)。
举个例子:
我要爬个bilibili的视频:
const url = require('url')
const fs = require('fs')

/**
 * Download a resource to `<__dirname>/<fileName>`, printing progress to the
 * console as data arrives.
 *
 * @param {string|{url: string, method?: string, headers?: Object, timeout?: number}} options
 *   Either the URL itself or a request description (copied, never modified).
 *   `timeout` is the socket idle timeout in milliseconds and defaults to 2000.
 * @param {Object} [headers] Request headers; they override same-named
 *   entries of `options.headers`.
 * @param {string} fileName Target file name, relative to this script's
 *   directory. An existing file is overwritten.
 * @returns {Promise<void>} Resolves once the file has been fully written.
 *   Rejects with an Error on a non-2xx status, a network or file-system
 *   failure, or a socket timeout.
 */
function getVideo(options, headers, fileName) {
  if (typeof options === 'string') {
    options = { url: options }
  }
  // Work on a copy so the caller's object is left untouched.
  options = Object.assign({}, options)
  options.method = options.method || 'GET'
  options.timeout = options.timeout || 2000
  options.headers = Object.assign({}, options.headers, headers)

  return new Promise((resolve, reject) => {
    // Choose the transport from the URL scheme.
    const urlObj = url.parse(options.url)
    const isHttps = urlObj.protocol === 'https:'
    const mod = isHttps ? require('https') : require('http')
    const _req_options = {
      hostname: urlObj.hostname,
      // Honour an explicit port in the URL; fall back to the scheme default.
      port: urlObj.port || (isHttps ? 443 : 80),
      path: urlObj.path,
      method: options.method,
      headers: options.headers,
      timeout: options.timeout
    }

    const req_obj = mod.request(_req_options, (res) => {
      // Do not save an error page as if it were the video.
      if (res.statusCode < 200 || res.statusCode >= 300) {
        res.resume()
        reject(new Error(`download failed with HTTP status ${res.statusCode}`))
        return
      }

      // Destination path of the video file.
      const filePath = `${__dirname}/${fileName}`
      // NaN when the server sends no Content-Length; then only the size is shown.
      const total = Number(res.headers['content-length'])
      // Opening for writing truncates any previous file; streaming avoids
      // blocking the event loop with a synchronous append + stat per chunk.
      const out = fs.createWriteStream(filePath)
      let downloaded = 0

      res.on('data', chunk => {
        downloaded += chunk.length
        const mb = (downloaded / 1024 / 1024).toFixed(2)
        if (total > 0) {
          console.log(`已下载${mb}MB,完成${(downloaded / total * 100).toFixed(2)}%`)
        } else {
          console.log(`已下载${mb}MB`)
        }
      })
      res.on('error', err => {
        out.destroy()
        reject(err)
      })
      out.on('error', err => {
        req_obj.destroy()
        reject(err)
      })
      // Resolve only after every byte has been flushed to disk.
      out.on('finish', () => {
        resolve()
      })
      res.pipe(out)
    })

    // The `timeout` option only emits this event; the request has to be
    // torn down explicitly, which then surfaces through the 'error' handler.
    req_obj.on('timeout', () => {
      req_obj.destroy(new Error(`request timed out after ${options.timeout}ms`))
    })
    req_obj.on('error', err => {
      reject(err)
    })
    req_obj.end()
  })
}

// Output file name
const fileName = '1.flv'
// Video URL
const videoUrl = 'https://cn-sdyt-cu-v-05.acgvideo.com/upgcxcode/66/83/34548366/34548366-1-64.flv?expires=1545405600&platform=pc&ssig=ElhY4A2e-U4R2m8EI1eiGQ&oi=1928611810&nfa=uTIiNt+AQjcYULykM2EttA==&dynamic=1&hfa=2116953847&hfb=Yjk5ZmZjM2M1YzY4ZjAwYTMzMTIzYmIyNWY4ODJkNWI=&trid=45c5fdc464354b71bf599c224b7df8ea&nfb=maPYqpoel5MI3qOUX6YpRA==&nfc=1';
// Request headers
const header = {
  'Origin': 'https://www.bilibili.com',
  'Referer': 'https://www.bilibili.com/video/av21061574',
  'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.77 Safari/537.36',
}

getVideo(videoUrl, header, fileName)
  .then(res => {
    console.log('写入成功');
  })
  .catch(err => {
    // Surface the failure instead of leaving an unhandled rejection.
    console.error('爬虫失败', err)
  })