Nginx: blocking crawlers and limiting the number of concurrent client requests

cat /usr/local/nginx/conf/agent_deny.conf

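# Block well-known search-engine spiders and archive bots (see the note below about allowing Baiduspider and Googlebot)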
if ($http_user_agent ~* "qihoobot|Baiduspider|Googlebot|Googlebot-Mobile|Googlebot-Image|Mediapartners-Google|Adsbot-Google|Feedfetcher-Google|Yahoo! Slurp|Yahoo! Slurp China|YoudaoBot|Sosospider|Sogou spider|Sogou web spider|MSNBot|ia_archiver|Tomato Bot|Catall Spider|AcoiRobot") { 
    return 403;
}

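# Block common scraping tools, HTTP client libraries, and empty User-Agent strings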
if ($http_user_agent ~ "WinHttp|WebZIP|FetchURL|node-superagent|java/|FeedDemon|Jullo|JikeSpider|Indy Library|Alexa Toolbar|AskTbFXTV|AhrefsBot|CrawlDaddy|Java|Feedly|Apache-HttpAsyncClient|UniversalFeedParser|ApacheBench|Microsoft URL Control|Swiftbot|ZmEu|oBot|jaunty|Python-urllib|lightDeckReports Bot|YYSpider|DigExt|HttpClient|MJ12bot|heritrix|EasouSpider|Ezooms|BOT/0.1|YandexBot|FlightDeckReports|Linguee Bot|iaskspider^$") {
    return 403;             
}

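# Allow only GET, HEAD and POST request methods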
if ($request_method !~ ^(GET|HEAD|POST)$) {
    return 403;
}

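# Catch-all for generic script and tool user agents (case-insensitive match)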
if ($http_user_agent ~* "(Python|Java|Wget|Scrapy|Curl|HttpClient|Spider)") {
    return 403;
}

# Block a single IP:
# deny 123.45.6.7;
# Block an entire /8 range (123.0.0.1 - 123.255.255.254):
# deny 123.0.0.0/8;
# Block a /16 range (123.45.0.1 - 123.45.255.254):
# deny 123.45.0.0/16;
# Block a /24 range (123.45.6.1 - 123.45.6.254):
# deny 123.45.6.0/24;
# The following IP ranges are all known abusers:
deny 58.95.66.0/24;

Notes:

Normally you do want Baidu's and Google's crawlers to fetch the site's content (the homepage of the official site, for example), so Baiduspider and Googlebot can be left open. To allow them, remove their user agents from the first blocklist in agent_deny.conf; a sketch of the adjusted rule follows.
The agent_deny.conf file is included inside the server virtual-host block of the official website, as shown in the configuration below.
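
A minimal sketch of the first rule with the Baidu and Google user agents removed, keeping only the remaining names from the original list:

if ($http_user_agent ~* "qihoobot|Yahoo! Slurp|Yahoo! Slurp China|YoudaoBot|Sosospider|Sogou spider|Sogou web spider|MSNBot|ia_archiver|Tomato Bot|Catall Spider|AcoiRobot") {
    return 403;
}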

The following nginx configuration file is the reverse-proxy / load-balancing configuration:
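
The server block below references several names that must already be defined at the http level: the shared-memory zones reqip (for limit_req) and conip (for limit_conn), the proxy cache cache_one, and the upstream group backend. These definitions are not part of the original listing; a minimal sketch, where the zone sizes, request rate, cache path, and backend addresses are all assumptions:

http {
    # Per-client-IP request-rate zone referenced by "limit_req zone=reqip ..."
    limit_req_zone $binary_remote_addr zone=reqip:10m rate=100r/s;
    # Per-client-IP connection-count zone referenced by "limit_conn conip ..."
    limit_conn_zone $binary_remote_addr zone=conip:10m;
    # Cache storage referenced by "proxy_cache cache_one" (path and sizes are placeholders)
    proxy_cache_path /home/cache levels=1:2 keys_zone=cache_one:200m inactive=1d max_size=10g;
    # Upstream pool referenced by "proxy_pass http://backend" (addresses are placeholders)
    upstream backend {
        server 192.168.8.10:8080;
        server 192.168.8.11:8080;
    }
    # ... the server block below goes here ...
}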

server {
    listen       80;
    server_name  pk.tltest.com static.tltest.com; 
    access_log   /home/wwwlogs/access.log  main;

    ## Anti-crawler rules: the agent_deny.conf file shown above
    include /usr/local/nginx/conf/agent_deny.conf;
    location / {
       limit_req zone=reqip burst=200 nodelay;
       proxy_cache cache_one;
       proxy_cache_valid  200 304 301 302 99s;
       proxy_cache_valid any 1s;
       proxy_redirect off;
       proxy_set_header Host $host;
       proxy_set_header X-Real-IP $remote_addr;
       proxy_set_header REMOTE-HOST $remote_addr;
       proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for;
       proxy_set_header Connection "";
       proxy_http_version 1.1;
       proxy_ignore_client_abort on;
       proxy_ignore_headers Set-Cookie Cache-Control;
       client_max_body_size 30m;
       client_body_buffer_size 256k;
       proxy_connect_timeout 75;
       proxy_send_timeout 300;
       proxy_read_timeout 300;
       proxy_buffer_size 1m;
       proxy_buffers 8 512k;
       proxy_busy_buffers_size 2m;
       proxy_temp_file_write_size 2m;
       proxy_next_upstream error timeout invalid_header http_500 http_502 http_503;
       proxy_max_temp_file_size 128m;
       proxy_pass http://backend;
    }
    
    location ~* \.(php|python)$ {
       proxy_set_header Host $host;
       proxy_set_header X-Forwarded-For  $remote_addr;
       proxy_pass http://backend;
    }
    
    #### Limit, at the nginx front end, the concurrency of client search requests to this path
    location = /novel/search {
       limit_conn conip 2;
       limit_req zone=reqip burst=3 nodelay;
       proxy_set_header Host $host;
       proxy_set_header X-Forwarded-For  $remote_addr;
       proxy_pass http://backend;
       #access_log /home/wwwlogs/search.log  main;
    }
 
    #### Limit, at the nginx front end, the concurrency of client downloads of cached file content under this path
    location = /novel/read/cache {
       limit_conn conip 1;
       limit_req zone=reqip burst=2 nodelay;
       limit_rate 512k;
       proxy_set_header Host $host;
       proxy_set_header X-Forwarded-For  $remote_addr;
       proxy_pass http://backend;
       #access_log /home/wwwlogs/download.log  main;
    }
    #### Limit, at the nginx front end, the concurrency of client APK downloads under this path
    location = /novel/read/content {
       limit_conn conip 5;
       limit_req zone=reqip burst=10 nodelay;
       proxy_set_header Host $host;
       proxy_set_header X-Forwarded-For  $remote_addr;
       proxy_pass http://backend;
    }    

}
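
When limit_req or limit_conn rejects a request, nginx returns 503 by default, which is easy to mistake for a real backend failure in the logs. Both modules let you override that status code (available since nginx 1.3.15); a small sketch, with 429 Too Many Requests as an assumed choice:

    # Inside the server or location block where the limits apply
    limit_req_status  429;
    limit_conn_status 429;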

Reference:
https://www.centos.bz/2018/01/nginx支持https并且支持反爬虫/
