动态IP抓取快递信息

运营人员需要抓取快递信息,用的第三方服务不太靠谱,于是自己在前端遍历查询。发现每一个IP抓取约50条后就会被屏蔽;也可以每秒5~6个慢慢抓,测试过这样不会被屏蔽。
准备工作

用的是 云连HTTP代理 的每天10个的免费 IP地址

用的request去抓取信息和获取IP地址 
前端对接 用的是koa2 ,koa-bodyparser处理post请求,koa2-cors处理前端请求跨域

动态IP设置的代码是在网上百度到的,自己做了修改

 package

{
  "name": "ip",
  "version": "1.0.0",
  "description": "",
  "main": "index.js",
  "scripts": {
    "test": "echo \"Error: no test specified\" && exit 1"
  },
  "author": "",
  "license": "ISC",
  "dependencies": {
    "bluebird": "^3.5.1",
    "koa": "^2.5.2",
    "koa-bodyparser": "^3.2.0",
    "koa2-cors": "^2.0.6",
    "query-string": "^6.1.0",
    "request": "^2.88.0"
  }
}

 

以下是动态IP设置和快递信息请求

const request = require("request");

const Promise = require("bluebird");
const queryString = require('query-string');

// Pool of browser User-Agent strings; one is picked at random for every
// outgoing scrape request. The list contains a few repeated entries.
const userAgents = [
    'Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.8.0.12) Gecko/20070731 Ubuntu/dapper-security Firefox/1.5.0.12',
    'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0; Acoo Browser; SLCC1; .NET CLR 2.0.50727; Media Center PC 5.0; .NET CLR 3.0.04506)',
    'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_3) AppleWebKit/535.20 (KHTML, like Gecko) Chrome/19.0.1036.7 Safari/535.20',
    'Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.0.8) Gecko Fedora/1.9.0.8-1.fc10 Kazehakase/0.5.6',
    'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.71 Safari/537.1 LBBROWSER',
    'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Win64; x64; Trident/5.0; .NET CLR 3.5.30729; .NET CLR 3.0.30729; .NET CLR 2.0.50727; Media Center PC 6.0) ,Lynx/2.8.5rel.1 libwww-FM/2.14 SSL-MM/1.4.1 GNUTLS/1.2.9',
    'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; .NET CLR 1.1.4322; .NET CLR 2.0.50727)',
    'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; .NET4.0E; QQBrowser/7.0.3698.400)',
    'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; QQDownload 732; .NET4.0C; .NET4.0E)',
    'Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:2.0b13pre) Gecko/20110307 Firefox/4.0b13pre',
    'Opera/9.80 (Macintosh; Intel Mac OS X 10.6.8; U; fr) Presto/2.9.168 Version/11.52',
    'Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.8.0.12) Gecko/20070731 Ubuntu/dapper-security Firefox/1.5.0.12',
    'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; .NET4.0E; LBBROWSER)',
    'Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.0.8) Gecko Fedora/1.9.0.8-1.fc10 Kazehakase/0.5.6',
    'Mozilla/5.0 (X11; U; Linux; en-US) AppleWebKit/527+ (KHTML, like Gecko, Safari/419.3) Arora/0.6',
    'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; .NET4.0E; QQBrowser/7.0.3698.400)',
    'Opera/9.25 (Windows NT 5.1; U; en), Lynx/2.8.5rel.1 libwww-FM/2.14 SSL-MM/1.4.1 GNUTLS/1.2.9',
    'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36'
];
// This is only a test, so the state below lives in plain module variables;
// a real deployment should keep it in a proper data cache.
const expiryTime = 1 * 60 * 1000;// expiry interval, in milliseconds (one minute)
let ips = null; // proxy addresses as 'ip:port' strings; null until the first list has been fetched
let time = Date.now();// when the proxy list was last stored; meant for deciding whether it has expired and must be requested again

/**
 * Fetch a list of proxy addresses from the proxy vendor's HTTP API and store
 * it in the module-level `ips` / `time` variables (test-grade cache only).
 *
 * The returned promise never rejects: when the request fails, the response is
 * not valid JSON, or the API reports `success: 'false'`, the previously stored
 * list (possibly null) is kept and returned so callers can degrade gracefully.
 *
 * @param {boolean} [flag] - When truthy, bypass the cached list and always
 *   ask the API for a new one.
 * @returns {Promise<string[]|null>} Proxy addresses formatted as 'ip:port'.
 */
const getProxyList = (flag) => {
    return new Promise((resolve) => {
        const nowDate = Date.now();
        // Serve the cached list while it is still fresh. The non-empty check
        // matters because `time` is initialised at module load, before any
        // list has been fetched.
        if (!flag && Array.isArray(ips) && ips.length > 0 && nowDate - time < expiryTime) {
            console.log('直接return');
            resolve(ips);
            return;
        }

        // Endpoint of the dynamic-IP vendor API.
        const apiURL = 'http://xx.xxx.xxx.xx:xxxx/Index-generate_api_url.html?packid=7&fa=5&qty=10&port=1&format=json&ss=5&css=&pro=&city=';
        const options = {
            method: 'GET',
            url: apiURL,
            gzip: true,
            encoding: null,
        };

        request(options, (error, response, body) => {
            // On a transport error `body` is undefined; touching it would
            // throw inside this callback and leave the promise pending forever.
            if (error || !body) {
                console.log(`获取代理ip失败,${error}`);
                resolve(ips);
                return;
            }

            let payload;
            try {
                payload = JSON.parse(body.toString());
            } catch (parseError) {
                console.log(`获取代理ip失败,${parseError}`);
                resolve(ips);
                return;
            }
            console.log('body:', payload);

            // Keep the old list when the API reports a failure (string or
            // boolean flag) or does not return a usable `data` array.
            const failed = !payload
                || String(payload.success) === 'false'
                || !Array.isArray(payload.data);
            const ret = failed ? ips : payload.data.map((res) => res.IP + ':' + res.Port);
            ips = ret;
            console.log(ret);
            time = Date.now();
            resolve(ret);
        });
    });
};

/**
 * Query kuaidi100 for one shipment using a random User-Agent and, when a
 * proxy list is available, a random proxy from `ips`.
 *
 * @param {Object} data - Query parameters, serialised into the query string.
 * @returns {Promise<Buffer|undefined>} The raw response body, or undefined
 *   when the request failed (the promise still resolves in that case).
 */
async function reptile(data) {
    return new Promise((resolve) => {
        const userAgent = userAgents[Math.floor(Math.random() * userAgents.length)];
        const options = {
            method: 'GET',
            url: 'http://www.kuaidi100.com/query?' + queryString.stringify(data),
            gzip: true,
            encoding: null,
            headers: {
                'User-Agent': userAgent, // rotate the browser identity per request
            },
            timeout: 8000
        };

        // Route through a random proxy when one is known; without a list the
        // request goes out directly instead of crashing on `ips[...]`.
        if (Array.isArray(ips) && ips.length > 0) {
            const ip = ips[Math.floor(Math.random() * ips.length)];
            console.log('ip:', ip);
            options.proxy = `http://${ip}`;
        }

        request(options, (error, response, body) => {
            // Some proxies are unreachable; the caller reacts to the missing
            // body by refreshing the proxy list.
            if (error) {
                console.log(`爬取页面失败,${error},正在重新寻找代理ip... ×`);
            } else {
                console.log('爬取页面成功, √', body ? body.toString() : body);
                console.log('爬取页面成功, √', data);
            }
            resolve(body);
        });
    });
}

/**
 * Entry point: look up one shipment and report the outcome.
 *
 * @param {Object} data - Query parameters forwarded to kuaidi100.
 * @returns {Promise<{code: number, msg: string, data?: string}>}
 *   `{ code: 200, data, msg: '' }` with the raw response text on success, or
 *   `{ code: 400, msg: '抓取失败' }` when nothing was fetched or the upstream
 *   answered that the IP is banned (the proxy list is refreshed in that case).
 */
async function startFun(data) {
    if (!ips || ips.length === 0) {
        await getProxyList();
    }
    const body = await reptile(data); // scrape the page
    if (!body || body.toString().includes('非法访问:IP禁止访问')) {
        await getProxyList(true);
        return {
            code: 400,
            msg: '抓取失败'
        };
    }
    return {
        code: 200,
        data: body.toString(),
        msg: ''
    };
}

module.exports = startFun;

  

koa的代码

const Koa = require('koa')
const app = new Koa()
const startFun = require('./startFun')
const bodyParser = require('koa-bodyparser')

// Parse request bodies so later middleware can read ctx.request.body.
app.use(bodyParser())
/**
 * CORS middleware: adds the cross-origin response headers to every response
 * and answers preflight (OPTIONS) requests itself.
 *
 * Without the OPTIONS short-circuit a preflight falls through to the router,
 * matches nothing and is answered with 404, which makes the browser block the
 * actual request for every non-simple method advertised below.
 */
app.use(async (ctx, next) => {
    // Allow requests from every origin.
    ctx.set("Access-Control-Allow-Origin", "*");
    ctx.set("Access-Control-Allow-Methods", "OPTIONS, GET, PUT, POST, DELETE");
    ctx.set("Access-Control-Allow-Headers", "x-requested-with, accept, origin, content-type");
    // NOTE(review): browsers refuse credentialed requests when the allowed
    // origin is the "*" wildcard, so this header only becomes effective once
    // the origin above is restricted to a concrete value.
    ctx.set("Access-Control-Allow-Credentials", "true");
    ctx.set("Access-Control-Max-Age", "300");
    ctx.set("Access-Control-Expose-Headers", "myData");

    if (ctx.method === 'OPTIONS') {
        // Preflight: the headers above are the whole answer, no body needed.
        ctx.status = 204;
        return;
    }

    ctx.set("Content-Type", "application/json;charset=utf-8");
    await next();
})
/**
 * GET /chakuaidi — forwards the query-string parameters to startFun and
 * returns its result object as the JSON response body.
 *
 * Any other path or method is left untouched (Koa answers with its default
 * 404). Unexpected failures are reported in the same `{ code, msg }` shape
 * that startFun uses, so the front-end's `code !== 200` branch still runs
 * instead of the request dying with an HTTP 500.
 */
app.use(async (ctx) => {
    if (ctx.method !== 'GET' || ctx.path !== '/chakuaidi') {
        return;
    }
    try {
        ctx.body = await startFun(ctx.query);
    } catch (err) {
        console.error(err);
        ctx.body = { code: 500, msg: '抓取失败' };
    }
})

// Start the HTTP server on port 3000 and log a message once it is listening.
app.listen(3000, () => {
    console.log('demo2 is run')
})

  前端代码

  // Tracking numbers to look up (placeholders).
  var a = ['xxxxxxxx', 'xxxxxxx', 'xxxxxx', ]

        /**
         * Look up the tracking numbers in `a` one after another, starting at
         * `index`, with a one-second pause before every request.
         *
         * A failed lookup (non-200 result code, unparsable payload or a
         * network error) is retried a limited number of times; after that
         * the number is reported as failed and the loop moves on, so one bad
         * number cannot keep the loop — and the server behind it — busy
         * forever. Nothing is started automatically: call `aa()` to begin.
         *
         * @param {number} [index=0] Position in `a` to query.
         * @param {number} [retries=0] Failed attempts so far for this index.
         */
        function aa(index, retries) {
            index = index || 0;
            retries = retries || 0;
            if (index >= a.length) return;

            var MAX_RETRIES = 5;

            // Try the same number again, or give up on it and continue.
            function retryOrSkip() {
                if (retries < MAX_RETRIES) {
                    aa(index, retries + 1);
                } else {
                    console.log(a[index] + ',' + index + ',' + '抓取失败');
                    aa(index + 1);
                }
            }

            setTimeout(function () {
                var url = 'http://xxx.xx.xx.xxx:xxxx/chakuaidi?type=yunda&postid=' + encodeURIComponent(a[index]) + '&temp=' + Math.random();
                $.get(url, function (res) {
                    if (!res || res.code !== 200) {
                        retryOrSkip();
                        return;
                    }

                    var info;
                    try {
                        info = JSON.parse(res.data);
                    } catch (e) {
                        retryOrSkip();
                        return;
                    }

                    // The upstream answer may lack the `data` array (e.g. an
                    // error object); treat that as "no tracking records".
                    var records = info && Array.isArray(info.data) ? info.data : [];
                    console.log(((info && info.nu) || a[index]) + ',' + index + ',' + (!records.length ? '暂无数据' : records[0].context));
                    aa(index + 1);
                }).fail(retryOrSkip); // network/HTTP errors never reach the success callback
            }, 1000)
        }
        console.log(a.length)

  

请求比每秒查几个快了很多,但发现查到150个左右就开始报禁止IP,应该是IP太少了。不过比一条一条查感觉快很多,量少时推荐使用。
前端拿到数据后,再写一个前端的导出功能,把数据直接导出来就可以用了。

 

posted @ 2018-09-11 15:39  V黑匣子  阅读(223)  评论(0编辑  收藏  举报