Nginx: blocking crawlers and limiting the concurrency of client requests
cat /usr/local/nginx/conf/agent_deny.conf
if ($http_user_agent ~* "qihoobot|Baiduspider|Googlebot|Googlebot-Mobile|Googlebot-Image|Mediapartners-Google|Adsbot-Google|Feedfetcher-Google|Yahoo! Slurp|Yahoo! Slurp China|YoudaoBot|Sosospider|Sogou spider|Sogou web spider|MSNBot|ia_archiver|Tomato Bot|Catall Spider|AcoiRobot") {
return 403;
}
if ($http_user_agent ~ "WinHttp|WebZIP|FetchURL|node-superagent|java/|FeedDemon|Jullo|JikeSpider|Indy Library|Alexa Toolbar|AskTbFXTV|AhrefsBot|CrawlDaddy|Java|Feedly|Apache-HttpAsyncClient|UniversalFeedParser|ApacheBench|Microsoft URL Control|Swiftbot|ZmEu|oBot|jaunty|Python-urllib|lightDeckReports Bot|YYSpider|DigExt|HttpClient|MJ12bot|heritrix|EasouSpider|Ezooms|BOT/0.1|YandexBot|FlightDeckReports|Linguee Bot|iaskspider^$") {
return 403;
}
if ($request_method !~ ^(GET|HEAD|POST)$) {
return 403;
}
if ($http_user_agent ~* (Python|Java|Wget|Scrapy|Curl|HttpClient|Spider)) {
return 403;
}
# Block a single IP:
#deny 123.45.6.7;
# Block an entire /8, i.e. 123.0.0.1 through 123.255.255.254:
#deny 123.0.0.0/8;
# Block a /16, i.e. 123.45.0.1 through 123.45.255.254:
#deny 123.45.0.0/16;
# Block a /24, i.e. 123.45.6.1 through 123.45.6.254:
#deny 123.45.6.0/24;
# The following IPs are all known abusers
deny 58.95.66.0/24;
Notes:
Normally you do want Baidu's and Google's crawlers to index the site's content, such as the official home page, so Baiduspider and Googlebot can be exempted from the block list and allowed to crawl.
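A minimal way to do that, assuming you accept that the User-Agent header is trivially spoofed, is to drop the Baidu and Google tokens from the first pattern in agent_deny.conf:

if ($http_user_agent ~* "qihoobot|Yahoo! Slurp|Yahoo! Slurp China|YoudaoBot|Sosospider|Sogou spider|Sogou web spider|MSNBot|ia_archiver|Tomato Bot|Catall Spider|AcoiRobot") {
    return 403;
}

Note that the catch-all Spider token in the last pattern of the file also matches Baiduspider case-insensitively, so that pattern would need the same trimming.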
The file agent_deny.conf is then included into the site's server virtual host (see the include directive below).
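The server block below references a limit_req zone named reqip, a limit_conn zone named conip, a proxy cache named cache_one and a log format named main, none of which are defined in it; they must be declared once in the http context. A minimal sketch of those declarations, where the rate, memory sizes and cache path are plausible assumptions rather than values from the original setup:

http {
    # Log format referenced by "access_log ... main"
    log_format main '$remote_addr - $remote_user [$time_local] "$request" '
                    '$status $body_bytes_sent "$http_referer" "$http_user_agent"';

    # Shared-memory zones keyed by client IP (rate and sizes are assumptions)
    limit_req_zone $binary_remote_addr zone=reqip:20m rate=50r/s;
    limit_conn_zone $binary_remote_addr zone=conip:10m;

    # Cache referenced by "proxy_cache cache_one" (path and sizes are assumptions)
    proxy_cache_path /data/nginx/cache levels=1:2 keys_zone=cache_one:200m inactive=1d max_size=10g;

    # ... upstream and server blocks go here
}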
The nginx configuration below is the reverse-proxy load-balancing configuration:
server {
    listen 80;
    server_name pk.tltest.com static.tltest.com;
    access_log /home/wwwlogs/access.log main;

    ## Anti-crawler rules
    include /usr/local/nginx/conf/agent_deny.conf;

    location / {
        limit_req zone=reqip burst=200 nodelay;
        proxy_cache cache_one;
        proxy_cache_valid 200 304 301 302 99s;
        proxy_cache_valid any 1s;
        proxy_redirect off;
        proxy_set_header Host $host;
        proxy_set_header X-Real-IP $remote_addr;
        proxy_set_header REMOTE-HOST $remote_addr;
        proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for;
        # Clear Connection so keepalive to the upstream works over HTTP/1.1
        proxy_set_header Connection "";
        proxy_http_version 1.1;
        proxy_ignore_client_abort on;
        proxy_ignore_headers Set-Cookie Cache-Control;
        client_max_body_size 30m;
        client_body_buffer_size 256k;
        proxy_connect_timeout 75;
        proxy_send_timeout 300;
        proxy_read_timeout 300;
        proxy_buffer_size 1m;
        proxy_buffers 8 512k;
        proxy_busy_buffers_size 2m;
        proxy_temp_file_write_size 2m;
        # Retry the next upstream on errors and timeouts
        proxy_next_upstream error timeout invalid_header http_500 http_502 http_503;
        proxy_max_temp_file_size 128m;
        proxy_pass http://backend;
    }

    location ~* \.(php|python)$ {
        proxy_set_header Host $host;
        proxy_set_header X-Forwarded-For $remote_addr;
        proxy_pass http://backend;
    }

    #### Limit the concurrency of client search requests against this path
    location = /novel/search {
        limit_conn conip 2;
        limit_req zone=reqip burst=3 nodelay;
        proxy_set_header Host $host;
        proxy_set_header X-Forwarded-For $remote_addr;
        proxy_pass http://backend;
        #access_log /home/wwwlogs/search.log main;
    }

    #### Limit the concurrency and bandwidth of content downloads from this path
    location = /novel/read/cache {
        limit_conn conip 1;
        limit_req zone=reqip burst=2 nodelay;
        limit_rate 512k;
        proxy_set_header Host $host;
        proxy_set_header X-Forwarded-For $remote_addr;
        proxy_pass http://backend;
        #access_log /home/wwwlogs/download.log main;
    }

    #### Limit the concurrency of apk downloads from this path
    location = /novel/read/content {
        limit_conn conip 5;
        limit_req zone=reqip burst=10 nodelay;
        proxy_set_header Host $host;
        proxy_set_header X-Forwarded-For $remote_addr;
        proxy_pass http://backend;
    }
}
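Every proxy_pass above points at an upstream group named backend that is not shown in the original. A minimal sketch of what it could look like, with purely hypothetical server addresses; the keepalive directive is what lets the proxy_http_version 1.1 / empty Connection header pair in the configuration above actually reuse upstream connections:

upstream backend {
    # Hypothetical application servers
    server 192.168.1.11:8080;
    server 192.168.1.12:8080;
    # Keep a pool of idle connections open to the upstreams
    keepalive 32;
}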