[Anti-Scraping] YunSuo (云锁) Server Security Anti-Crawling

What is YunSuo?

YunSuo is actually a server security product whose main job is not anti-crawling, but one of its features is protection against CC attacks. To it, crawler traffic looks like a low-frequency CC attack, so a plain request to the target site does not return the target content.

How does YunSuo block crawlers?

(1) It bans IPs that request too frequently.
(2) It sets a verification cookie: the real page is only served after a small JavaScript challenge has been answered (see the sketch right below).
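
Getting past check (2) does not require a JS engine. Below is a minimal sketch of the handshake, assuming the requests library; the parameter and cookie names (security_verify_data, srcurl, security_session_*) are taken from the JavaScript the blocked page serves and from the spider further down, and may differ between YunSuo versions.

# -*- coding: UTF-8 -*-
# Minimal sketch of the YunSuo cookie handshake with requests (assumed available).
import requests


def string_to_hex(text):
    # Python port of the page's stringToHex(): concatenate the hex char codes.
    return "".join(format(ord(ch), "x") for ch in text)


def pass_yunsuo(url, width=1920, height=1080):
    session = requests.Session()
    first = session.get(url)                      # first answer is the JS challenge page
    if "security_verify_" not in first.text:
        return session                            # no challenge, nothing to do
    # The challenge JS stores the original URL hex-encoded in a "srcurl" cookie ...
    session.cookies.set("srcurl", string_to_hex(url), path="/")
    # ... and requests ?security_verify_data=<hex("1920,1080")> to obtain the session cookie.
    session.get(url + "?security_verify_data=" + string_to_hex("%d,%d" % (width, height)))
    return session


session = pass_yunsuo("http://scjdglj.hlbe.gov.cn/chufa/qiangzhi2/")
html = session.get("http://scjdglj.hlbe.gov.cn/chufa/qiangzhi2/").text  # now the real list page

The full Scrapy spider below does the same thing against http://scjdglj.hlbe.gov.cn/chufa/qiangzhi2/, re-running the handshake whenever a response still contains the security_verify_ marker.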

 

# -*- coding: UTF-8 -*-

import os
import sys
import copy
import re
import time
import datetime
import traceback

import scrapy
from bs4 import BeautifulSoup
from urllib.parse import urlencode

from spiders.market_supervision_penalty.govement_penalty_base_spider import govement_penalty_base_spider
from spiders.base_spiders.base_spider import *
from utils.common_util import *
from utils.date_util import current_datetime
from config.proxy.config import *


class nmg_market_gov_hlbe(govement_penalty_base_spider):
    """
    Site URL: http://scjdglj.hlbe.gov.cn/chufa/qiangzhi2/
    author: wuck
    """

    name = "nmg_market_gov_hlbe"

    is_not_change_proxy = True  # reuse a single proxy for the whole crawl
    is_proxy = True
    proxy_type = PROXY_TYPE_WD
    proxy_count = 50

    def __init__(self, increment=None, *args, **kwargs):
        super(nmg_market_gov_hlbe, self).__init__(*args, **kwargs)

        self.increment = increment

        self.headers = {
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
            'Accept-Encoding': 'gzip, deflate',
            'Accept-Language': 'zh-CN,zh;q=0.9',
            'Host': 'scjdglj.hlbe.gov.cn',
            'Referer': 'http://scjdglj.hlbe.gov.cn/chufa/qiangzhi2/',
            'Upgrade-Insecure-Requests': '1',
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.212 Safari/537.36',
        }


    def start_requests(self):
        # The first request hits the YunSuo JS challenge instead of the real list page.
        index_url = "http://scjdglj.hlbe.gov.cn/chufa/qiangzhi2/"
        yield scrapy.Request(url=index_url, method='GET', headers=self.headers,
                             encoding="utf-8", dont_filter=True)

    def parse(self, response):
        resp_url = response.url
        resp_meta = copy.deepcopy(response.meta)
        try:
            # Trimmed copy of the YunSuoAutoJump() script served by the challenge page:
            # it hex-encodes the screen resolution and stores the original URL in a cookie.
            resp_js = '''
            var screen = {
                width : 1920,
                height: 1080
            }

            var cookie = null;
            var location = null;
            var window = {
                location: {
                    href: 'http://scjdglj.hlbe.gov.cn/chufa/qiangzhi2/'
                }
            }

            function stringToHex(str) {
                var val = "";
                for (var i = 0; i < str.length; i++) {
                    if (val == "") val = str.charCodeAt(i).toString(16); else val += str.charCodeAt(i).toString(16);
                }
                return val;
            }



            function YunSuoAutoJump() {
                var width = screen.width;
                var height = screen.height;
                var screendate = width + "," + height;
                var curlocation = window.location.href;
                if (-1 == curlocation.indexOf("_security_verify_")) {
                    cookie = "srcurl=" + stringToHex(window.location.href) + ";path=/;";
                }
                location = stringToHex(screendate)
                return [location , cookie]
            }

                        '''
            # YunSuoAutoJump() returns [hex(resolution), "srcurl=<hex(url)>;path=/;"];
            # keep only the srcurl=... pair for the cookie header.
            scurl = pyv8_engine_service(resp_js, functionName='YunSuoAutoJump').split(',')[1].split(';')[0]
            header = deepCopy(self.headers)
            cookie = response.headers["Set-Cookie"].decode().split(";")[0]
            header["cookie"] = cookie + ';' + scurl
            # 313932302c31303830 is hex("1920,1080"); this request trades it for the session cookie.
            cookie_url = 'http://scjdglj.hlbe.gov.cn/chufa/qiangzhi2/?security_verify_data=313932302c31303830'
            yield scrapy.Request(url=cookie_url, method='GET', headers=header, meta={**resp_meta, 'cookie': cookie},
                                 encoding="utf-8", dont_filter=True, callback=self.parse_scurl)
        except Exception:
            traceback.print_exc()
            self.logger.info(f"parse error url: {resp_url}")

    def parse_scurl(self, response):
        resp_url = response.url
        resp_meta = copy.deepcopy(response.meta)
        try:
            cookie_str, cookie_dict = getSetcookie2Str(response)
            if "security_session_mid_verify" in cookie_str:
                header = deepCopy(self.headers)
                header["cookie"] = cookie_str
                if "list"  in str(resp_meta) and "detail" not in str(resp_meta):
                    yield scrapy.Request(url=resp_meta['list'], method='GET',headers=header,
                                         encoding="utf-8", dont_filter=True, callback=self.parse_list, meta=resp_meta)
                elif "detail" in str(resp_meta):
                    yield scrapy.Request(url=resp_meta['detail'], method='GET',headers=header,
                                         encoding="utf-8", dont_filter=True, callback=self.parse_detail, meta=resp_meta)
                else:
                    yield scrapy.Request(url='http://scjdglj.hlbe.gov.cn/chufa/qiangzhi2/', method='GET', headers=header,
                                         encoding="utf-8", dont_filter=True, callback=self.parse_number,meta=resp_meta)
            else:
                # No session cookie yet: retry the verification request with the first-response cookie.
                header = deepCopy(self.headers)
                header["cookie"] = resp_meta['cookie']
                yield scrapy.Request(url=resp_url, method='GET', headers=header, meta=resp_meta,
                                     encoding="utf-8", dont_filter=True, callback=self.parse_scurl)
        except Exception:
            traceback.print_exc()
            self.logger.info(f"parse error url: {resp_url}")

    def parse_number(self, response):
        resp_url = response.url
        resp_meta = copy.deepcopy(response.meta)
        try:
            # NOTE: both branches currently walk only the first two list pages;
            # raise the non-incremental value to cover more pages on a full crawl.
            search_number = 2 if self.increment else 2
            for index in range(1, search_number + 1):
                if index > 1:
                    send_url = 'http://scjdglj.hlbe.gov.cn/chufa/qiangzhi2/{}/'.format(index)
                    yield scrapy.Request(url=send_url, method='GET', headers=self.headers,meta=resp_meta,
                                         encoding="utf-8", dont_filter=True, callback=self.parse_list)
                else:
                    yield scrapy.Request(url=resp_url, method='GET', headers=self.headers,meta=resp_meta,
                                         encoding="utf-8", dont_filter=True, callback=self.parse_list)
        except Exception:
            traceback.print_exc()
            self.logger.info(f"parse error url: {resp_url}")

    def parse_list(self, response):
        resp_url = response.url
        resp_meta = copy.deepcopy(response.meta)
        try:
            resp_soup = BeautifulSoup(response.text, 'html5lib')
            if "security_verify_" not in response.text:
                detail_list = resp_soup.select('div.w670 li')[1:-5]
                for detail in detail_list:
                    if "href" in str(detail):
                        detail_url = response.urljoin(detail.select_one('a')['href'])
                        pub_time = detail.select('div')[-1].text.strip()
                        docno = detail.select_one('div').text
                        fileno = detail.select('div')[1].text
                        yield scrapy.Request(url=detail_url, method='GET', headers=self.headers,meta={**resp_meta,'pub_time':pub_time,'docno':docno,'fileno':fileno},
                                             encoding="utf-8", dont_filter=True, callback=self.parse_detail)
            else:
                # Hit the JS challenge again: redo the handshake, remembering which list page to return to.
                yield scrapy.Request(url=resp_url, method='GET', headers=self.headers, meta={**resp_meta, "list": resp_url},
                                     encoding="utf-8", dont_filter=True, callback=self.parse)
        except Exception:
            traceback.print_exc()
            self.logger.info(f"parse error url: {resp_url}")

    def parse_detail(self, response):
        resp_url = response.url
        resp_meta = copy.deepcopy(response.meta)
        try:
            resp_body = response.text
            if "security_verify_" not in resp_body:
                resp_soup = BeautifulSoup(resp_body, 'html5lib')
                info = dict()
                info["Title"] = resp_soup.select_one('div.title').text
                source = resp_soup.select_one('div.item').text.replace('\n','').replace('\u3000','').strip()
                if "来源" in source:
                    info['MessageSource'] = re.findall(r'来源:(.*?)日期',source)[0].strip()
                info['索引号'] = resp_meta['docno']
                info['文号'] = resp_meta['fileno']
                info["middle_table"] = True
                info['PublishTime'] = re.findall(r'日期:(.*?)人气',source)[0].replace('/','-')
                info["Content"] = str(resp_soup.select_one('div.content'))
                info["ResponseBodyHtml"] = resp_body
                info["SourceUrl"] = resp_url
                info["Website"] = '呼伦贝尔市市场监督管理局'
                info["KeyNo"] = md5encode(info["Title"] + "|" + info["Website"] + "|" + info["Content"])
                yield info
            else:
                # Hit the JS challenge again: redo the handshake, remembering which detail page to return to.
                yield scrapy.Request(url=resp_url, method='GET', headers=self.headers, meta={**resp_meta, "detail": resp_url},
                                     encoding="utf-8", dont_filter=True, callback=self.parse)
        except Exception:
            traceback.print_exc()
            self.logger.info(f"parse error url: {resp_url}")
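
To run it, the spider has to sit inside the Scrapy project that provides govement_penalty_base_spider and the utils/config modules it imports (an assumption about the author's project layout); it can then be started from the Scrapy CLI, with increment as the author's switch for incremental runs:

scrapy crawl nmg_market_gov_hlbe                   # full crawl
scrapy crawl nmg_market_gov_hlbe -a increment=1    # incremental crawl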

 
