动态IP抓取快递信息
运营人员需要抓取快递信息,用的第三方的不太靠谱,自己前端遍历,发现每一个IP抓取50条,就被屏蔽了,也可以每秒5~6个慢慢抓,测试过不会被屏蔽
准备工作
用的是 云连HTTP代理 的每天10个的免费 IP地址
用的request去抓取信息和获取IP地址
前端对接 用的是koa2 ,koa-bodyparser处理post请求,koa2-cors处理前端请求跨域
动态IP设置的代码是在网上百度到的,自己做了一些修改
package
{ "name": "ip", "version": "1.0.0", "description": "", "main": "index.js", "scripts": { "test": "echo \"Error: no test specified\" && exit 1" }, "author": "", "license": "ISC", "dependencies": { "bluebird": "^3.5.1", "koa": "^2.5.2", "koa-bodyparser": "^3.2.0", "koa2-cors": "^2.0.6", "query-string": "^6.1.0", "request": "^2.88.0" } }
以下是动态IP设置和快递信息请求
const request = require("request");
const Promise = require("bluebird");
const queryString = require('query-string');

// Pool of User-Agent strings; one is picked at random for every scrape request.
const userAgents = [
  'Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.8.0.12) Gecko/20070731 Ubuntu/dapper-security Firefox/1.5.0.12',
  'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0; Acoo Browser; SLCC1; .NET CLR 2.0.50727; Media Center PC 5.0; .NET CLR 3.0.04506)',
  'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11',
  'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_3) AppleWebKit/535.20 (KHTML, like Gecko) Chrome/19.0.1036.7 Safari/535.20',
  'Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.0.8) Gecko Fedora/1.9.0.8-1.fc10 Kazehakase/0.5.6',
  'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.71 Safari/537.1 LBBROWSER',
  'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Win64; x64; Trident/5.0; .NET CLR 3.5.30729; .NET CLR 3.0.30729; .NET CLR 2.0.50727; Media Center PC 6.0) ,Lynx/2.8.5rel.1 libwww-FM/2.14 SSL-MM/1.4.1 GNUTLS/1.2.9',
  'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; .NET CLR 1.1.4322; .NET CLR 2.0.50727)',
  'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; .NET4.0E; QQBrowser/7.0.3698.400)',
  'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; QQDownload 732; .NET4.0C; .NET4.0E)',
  'Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:2.0b13pre) Gecko/20110307 Firefox/4.0b13pre',
  'Opera/9.80 (Macintosh; Intel Mac OS X 10.6.8; U; fr) Presto/2.9.168 Version/11.52',
  'Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.8.0.12) Gecko/20070731 Ubuntu/dapper-security Firefox/1.5.0.12',
  'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; .NET4.0E; LBBROWSER)',
  'Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.0.8) Gecko Fedora/1.9.0.8-1.fc10 Kazehakase/0.5.6',
  'Mozilla/5.0 (X11; U; Linux; en-US) AppleWebKit/527+ (KHTML, like Gecko, Safari/419.3) Arora/0.6',
  'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; .NET4.0E; QQBrowser/7.0.3698.400)',
  'Opera/9.25 (Windows NT 5.1; U; en), Lynx/2.8.5rel.1 libwww-FM/2.14 SSL-MM/1.4.1 GNUTLS/1.2.9',
  'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36',
];

// This is only a demo, so the proxy list lives in module-level variables;
// a real deployment should keep it in a proper data cache.
const expiryTime = 1 * 60 * 1000; // how long a fetched proxy list stays fresh, in milliseconds
let ips = null; // cached proxy addresses as "host:port" strings, or null before the first successful fetch
let time = Date.now(); // timestamp of the last successful proxy-list fetch

/**
 * Promise wrapper around the callback-style `request` call.
 *
 * @param {object} options - options object passed straight to `request`.
 * @returns {Promise<Buffer|string|undefined>} resolves with the response body,
 *   rejects with the transport error reported by `request`.
 */
const requestBody = (options) =>
  new Promise((resolve, reject) => {
    request(options, (error, response, body) => {
      if (error) {
        reject(error);
        return;
      }
      resolve(body);
    });
  });

/**
 * Picks a uniformly random element of a non-empty array.
 *
 * @param {Array} list - array to pick from.
 * @returns {*} one element of `list`.
 */
const pickRandom = (list) => list[Math.floor(Math.random() * list.length)];

/**
 * Fetches the list of free proxies and caches it in the module-level `ips`.
 *
 * Never rejects: when the proxy API cannot be reached, answers with
 * `success: 'false'`, or returns something unparsable, the previously cached
 * list (possibly `null`) is kept and returned.
 *
 * @param {boolean} [flag] - pass a truthy value to bypass the cache and force a refetch.
 * @returns {Promise<string[]|null>} the current proxy list.
 */
const getProxyList = async (flag) => {
  // Serve from cache only when there is actually something cached; without the
  // `ips` check the very first call would hand back null for a whole minute.
  if (!flag && ips && ips.length > 0 && Date.now() - time < expiryTime) {
    return ips;
  }

  // Endpoint of the dynamic-IP provider.
  const apiURL = 'http://xx.xxx.xxx.xx:xxxx/Index-generate_api_url.html?packid=7&fa=5&qty=10&port=1&format=json&ss=5&css=&pro=&city=';
  const options = {
    method: 'GET',
    url: apiURL,
    gzip: true,
    encoding: null,
  };

  try {
    const body = await requestBody(options);
    // Parse once; an undefined/empty body makes JSON.parse throw and lands in the catch below.
    const payload = JSON.parse(body ? body.toString() : '');
    console.log('body:', payload);

    if (payload.success !== 'false' && Array.isArray(payload.data)) {
      ips = payload.data.map((entry) => `${entry.IP}:${entry.Port}`);
      time = Date.now();
    }
  } catch (error) {
    // Keep the old list rather than crashing the process on a flaky proxy API.
    console.log('getProxyList failed:', error);
  }

  console.log(ips);
  return ips;
};

/**
 * Queries kuaidi100 for one parcel, using a random User-Agent and, when a
 * proxy list is available, a random proxy from it.
 *
 * NOTE(review): the original had the `proxy` option commented out, so the
 * selected proxy was never used. It is enabled here because the surrounding
 * code (proxy fetch before scraping, forced refresh on an IP ban) only makes
 * sense with it — confirm this matches the intended deployment.
 *
 * @param {object} data - query parameters forwarded to kuaidi100.
 * @returns {Promise<Buffer|undefined>} the raw response body, or `undefined`
 *   when the request failed (e.g. the chosen proxy is dead).
 */
async function reptile(data) {
  const userAgent = pickRandom(userAgents);
  const ip = ips && ips.length > 0 ? pickRandom(ips) : null;
  console.log('ip:', ip);

  const options = {
    method: 'GET',
    url: 'http://www.kuaidi100.com/query?' + queryString.stringify(data),
    gzip: true,
    encoding: null,
    headers: {
      'User-Agent': userAgent, // rotate the browser identity per request
    },
    timeout: 8000,
  };
  if (ip) {
    options.proxy = `http://${ip}`; // rotate the outgoing IP per request
  }

  try {
    const body = await requestBody(options);
    console.log('爬取页面成功, √', body.toString());
    console.log('爬取页面成功, √', data);
    return body;
  } catch (error) {
    // Some proxies are simply unreachable; report "no body" and let the caller
    // refresh the proxy list.
    console.log(`爬取页面失败,${error},正在重新寻找代理ip... ×`);
    return undefined;
  }
}

/**
 * Entry point: looks up one parcel and wraps the outcome in a result object.
 *
 * @param {object} data - query parameters forwarded to kuaidi100.
 * @returns {Promise<{code: number, data?: string, msg: string}>}
 *   `{ code: 200, data, msg: '' }` with the raw response text on success, or
 *   `{ code: 400, msg: '抓取失败' }` when nothing usable came back or the IP
 *   was banned (the proxy list is force-refreshed in that case).
 */
async function startFun(data) {
  if (!ips || ips.length === 0) {
    await getProxyList();
  }

  const body = await reptile(data);
  const text = body ? body.toString() : '';

  // An empty body is as useless to the caller as a missing one.
  if (!text || text.includes('非法访问:IP禁止访问')) {
    await getProxyList(true);
    return {
      code: 400,
      msg: '抓取失败'
    };
  }

  return {
    code: 200,
    data: text,
    msg: ''
  };
}

module.exports = startFun;
koa的代码
const Koa = require('koa');
const bodyParser = require('koa-bodyparser');
const startFun = require('./startFun');

const app = new Koa();

app.use(bodyParser());

// CORS headers for every response; the front-end page is served from another origin.
app.use(async (ctx, next) => {
  // Allow requests from any origin.
  ctx.set('Access-Control-Allow-Origin', '*');
  ctx.set('Access-Control-Allow-Methods', 'OPTIONS, GET, PUT, POST, DELETE');
  ctx.set('Access-Control-Allow-Headers', 'x-requested-with, accept, origin, content-type');
  ctx.set('Content-Type', 'application/json;charset=utf-8');
  // "Access-Control-Allow-Credentials: true" is intentionally not sent:
  // browsers reject credentialed requests when the allowed origin is "*".
  ctx.set('Access-Control-Max-Age', '300');
  ctx.set('Access-Control-Expose-Headers', 'myData');

  // Answer CORS preflights directly instead of letting them fall through to a 404.
  if (ctx.method === 'OPTIONS') {
    ctx.status = 204;
    return;
  }

  await next();
});

// GET /chakuaidi?type=...&postid=... -> result object produced by startFun.
app.use(async (ctx) => {
  if (ctx.path !== '/chakuaidi' || ctx.method !== 'GET') {
    return;
  }

  try {
    ctx.body = await startFun(ctx.query);
  } catch (err) {
    // Keep the JSON envelope so the front end can see the failure and retry,
    // instead of receiving a bare HTTP 500.
    console.error('chakuaidi failed:', err);
    ctx.body = { code: 500, msg: '抓取失败' };
  }
});

app.listen(3000, () => {
  console.log('demo2 is run');
});
前端代码
// Tracking numbers to look up (placeholders).
var a = ['xxxxxxxx', 'xxxxxxx', 'xxxxxx'];

// How many times one tracking number is retried before it is reported as failed.
var MAX_RETRIES = 5;

/**
 * Looks up the tracking numbers in `a` one after another, starting at `index`,
 * waiting one second before each request, and logs one
 * "number,index,latest status" line per parcel.
 *
 * A failed lookup (non-200 result code, unparsable payload, or network error)
 * is retried up to MAX_RETRIES times; after that a "抓取失败" line is logged
 * and the next number is processed, so one bad entry cannot stall the batch.
 *
 * @param {number} [index=0] - position in `a` to start from.
 * @param {number} [retries=0] - retries already spent on `a[index]` (internal).
 */
function aa(index, retries) {
  index = index || 0;
  retries = retries || 0;
  if (index >= a.length) return;

  // Retry the current number, or give up on it and move on.
  function retryOrSkip() {
    if (retries < MAX_RETRIES) {
      aa(index, retries + 1);
      return;
    }
    console.log(a[index] + ',' + index + ',抓取失败');
    aa(index + 1);
  }

  setTimeout(function () {
    // `temp` is a cache buster so the browser never serves a stale answer.
    var url = 'http://xxx.xx.xx.xxx:xxxx/chakuaidi?type=yunda&postid=' +
      encodeURIComponent(a[index]) + '&temp=' + Math.random();

    $.get(url, function (res) {
      if (!res || res.code !== 200) {
        retryOrSkip();
        return;
      }

      var info;
      try {
        info = JSON.parse(res.data) || {};
      } catch (e) {
        retryOrSkip();
        return;
      }

      // The tracking history may be missing entirely, not just empty.
      var records = Array.isArray(info.data) ? info.data : [];
      var latest = records.length ? records[0].context : '暂无数据';
      console.log((info.nu || a[index]) + ',' + index + ',' + latest);
      aa(index + 1);
    }).fail(retryOrSkip); // network/HTTP errors never reach the success callback
  }, 1000);
}

console.log(a.length);
这样请求比每秒查几个快了很多。发现查到150个左右就开始报禁止IP,应该是IP太少了,不过比一条一条查感觉快很多,量少的话推荐使用。
前端拿到数据后,再写一个前端的导出功能,把数据直接导出来,就可以直接用了