参考文章:

  https://andyliwr.github.io/2017/12/05/nodejs_spider_ip/

  https://segmentfault.com/q/1010000008196143

 

代码:

import request from 'request';
import userAgents from './common/userAgent';
import Promise from 'bluebird';

// Test-only setup: the proxy list is cached in module variables here;
// a real application should use a proper data cache instead.
let ips = null; // cached proxy IPs
let time = null; // time (ms) at which the proxy IPs were stored; used to tell whether they expired and must be re-requested
const expiryTime = 1000 * 60 * 10; // expiry interval, in milliseconds

/**
 * Request a list of free proxies from 66ip.cn.
 *
 * The result is cached in the module-level `ips` / `time` variables (test-only
 * caching) and reused until it is older than `expiryTime` milliseconds.
 *
 * @returns {Promise<string[]>} Resolves with a non-empty array of "ip:port" strings.
 *   Rejects when the HTTP request fails, when the response body is not a Buffer,
 *   or when the response contains no proxy address, so callers never wait on a
 *   promise that cannot settle.
 */
const getProxyList = () => {
    return new Promise((resolve, reject) => {
        // Only serve the cache when it actually holds proxies and is still fresh.
        const hasCachedList = Array.isArray(ips) && ips.length > 0;
        if (hasCachedList && Date.now() - time < expiryTime) {
            resolve(ips);
            return;
        }
        const apiURL = 'http://www.66ip.cn/mo.php?sxb=&tqsl=100&port=&export=&ktip=&sxa=&submit=%CC%E1++%C8%A1&textarea=http%3A%2F%2Fwww.66ip.cn%2F%3Fsxb%3D%26tqsl%3D100%26ports%255B%255D2%3D%26ktip%3D%26sxa%3D%26radio%3Dradio%26submit%3D%25CC%25E1%2B%2B%25C8%25A1';
        // `encoding: null` asks `request` to hand the body back as a Buffer.
        const options = { method: 'GET', url: apiURL, gzip: true, encoding: null,
            headers: {
                'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
                'Accept-Encoding': 'gzip, deflate',
                'Accept-Language': 'zh-CN,zh;q=0.8,en;q=0.6,zh-TW;q=0.4',
                'User-Agent': 'Mozilla/8.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/45.0.2454.101 Safari/537.36',
                'referer': 'http://www.66ip.cn/'
            },
        };
        request(options, (error, response, body) => {
            if (error) {
                reject(error);
                return;
            }
            if (!Buffer.isBuffer(body)) {
                reject(new Error('getProxyList: proxy API response body is not a Buffer'));
                return;
            }
            try {
                // Ports go up to 65535, so allow up to five digits after the colon.
                const ret = body.toString().match(/\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}:\d{1,5}/g);
                if (!ret) {
                    reject(new Error('getProxyList: no proxy addresses found in the API response'));
                    return;
                }
                ips = ret;
                time = Date.now();
                resolve(ret);
            } catch (e) {
                reject(e);
            }
        });
    });
};
/**
 * Crawl the target page through a randomly chosen proxy, using a randomly
 * chosen User-Agent from `userAgents`.
 *
 * @param {string[]} ipList Non-empty list of proxy addresses in "ip:port" form.
 * @returns {Promise<Buffer|undefined>} Resolves with the response body, or with
 *   whatever `request` passed as the body (normally `undefined`) when the request
 *   failed, so the caller can retry with another proxy. Rejects when `ipList`
 *   is not a non-empty array.
 */
async function reptile(ipList){
    if (!Array.isArray(ipList) || ipList.length === 0) {
        throw new Error('reptile: ipList must be a non-empty array of "ip:port" strings');
    }
    return new Promise((resolve) => {
        // Math.floor keeps the index inside [0, length); parseInt on a float goes
        // through string conversion and misreads tiny values such as 5e-7 as 5.
        const userAgent = userAgents[Math.floor(Math.random() * userAgents.length)];
        // Index by the list that was passed in, not by the module-level cache.
        const ip = ipList[Math.floor(Math.random() * ipList.length)];
        const useIp = `http://${ip}`;
        const options = { method: 'GET', url: 'http://www.qcnh1920.com', gzip: true, encoding: null,
            headers: {
                'User-Agent': userAgent, // set the browser User-Agent header dynamically
            },
            proxy: useIp, // set the proxy IP dynamically
            timeout: 8000
        };
        request(options, (error, response, body) => {
            // Some proxies cannot reach the site; in that case the caller is
            // expected to call again so another proxy gets picked.
            if (error) {
                console.log(`爬取页面失败,${error},正在重新寻找代理ip... ×`);
            } else {
                console.log('爬取页面成功,  √');
            }
            resolve(body);
        });
    });
}
/**
 * Entry point: fetch the proxy list, crawl the page through a random proxy and
 * print the HTML. When a crawl attempt yields no body, the whole sequence is
 * retried (the proxy list comes from `getProxyList` again and `reptile` picks
 * another random proxy).
 *
 * The retries run inside this call, so the returned promise settles only after
 * a page has been printed, and any failure along the way rejects it instead of
 * being lost in a detached recursive call.
 *
 * @returns {Promise<void>}
 */
async function startFun (){
    let body;
    do {
        const ipList = await getProxyList(); // get the proxy IPs
        body = await reptile(ipList); // crawl the page
    } while (!body);
    // Parse the HTML (for now it is only printed).
    console.log(body.toString());
}
// Kick off the crawl. The promise is handled explicitly so that a failure is
// reported and reflected in the exit code instead of surfacing as an
// unhandled rejection.
startFun().catch((err) => {
    console.error(err);
    process.exitCode = 1;
});

 

 

common/userAgent.js
  
// Pool of browser User-Agent strings; the crawler picks one at random per request.
// Every entry appears exactly once so that each one is equally likely to be chosen.
const userAgents = [
  'Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.8.0.12) Gecko/20070731 Ubuntu/dapper-security Firefox/1.5.0.12',
  'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0; Acoo Browser; SLCC1; .NET CLR 2.0.50727; Media Center PC 5.0; .NET CLR 3.0.04506)',
  'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11',
  'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_3) AppleWebKit/535.20 (KHTML, like Gecko) Chrome/19.0.1036.7 Safari/535.20',
  'Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.0.8) Gecko Fedora/1.9.0.8-1.fc10 Kazehakase/0.5.6',
  'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.71 Safari/537.1 LBBROWSER',
  'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Win64; x64; Trident/5.0; .NET CLR 3.5.30729; .NET CLR 3.0.30729; .NET CLR 2.0.50727; Media Center PC 6.0) ,Lynx/2.8.5rel.1 libwww-FM/2.14 SSL-MM/1.4.1 GNUTLS/1.2.9',
  'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; .NET CLR 1.1.4322; .NET CLR 2.0.50727)',
  'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; .NET4.0E; QQBrowser/7.0.3698.400)',
  'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; QQDownload 732; .NET4.0C; .NET4.0E)',
  'Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:2.0b13pre) Gecko/20110307 Firefox/4.0b13pre',
  'Opera/9.80 (Macintosh; Intel Mac OS X 10.6.8; U; fr) Presto/2.9.168 Version/11.52',
  'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; .NET4.0E; LBBROWSER)',
  'Mozilla/5.0 (X11; U; Linux; en-US) AppleWebKit/527+ (KHTML, like Gecko, Safari/419.3) Arora/0.6',
  'Opera/9.25 (Windows NT 5.1; U; en), Lynx/2.8.5rel.1 libwww-FM/2.14 SSL-MM/1.4.1 GNUTLS/1.2.9',
  'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36'
];

export default userAgents;

 


posted on 2018-07-15 23:24  浅唱年华1920  阅读(3381)  评论(0)  编辑  收藏  举报