Web Scraping Tips

Using PyCharm to add quotes to strings (request headers)

  • Copy the request headers that need quoting

  • Paste them into PyCharm, open find-and-replace (Ctrl+R) with Regex enabled, search for (.*?):\s(.*) and replace with '$1': '$2', to quote every header line in one pass (a plain-Python alternative is sketched below)
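If the IDE is not handy, the same quoting can be done with a small regex in Python. A minimal sketch (the raw_headers sample is hypothetical):

import re

# Raw headers as copied from the browser's Network panel (hypothetical sample)
raw_headers = """
Host: www.example.com
User-Agent: Mozilla/5.0
Accept: text/html
"""

# Capture each "key: value" line and build a headers dict
headers = dict(re.findall(r'^(.*?):\s*(.*)$', raw_headers.strip(), flags=re.M))
print(headers)
# {'Host': 'www.example.com', 'User-Agent': 'Mozilla/5.0', 'Accept': 'text/html'}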

Fixing "Paused in debugger" when inspecting a page with Chrome F12

  • Solution: some sites insert debugger statements as an anti-debugging measure. In the DevTools Sources panel, click the "Deactivate breakpoints" button (or press Ctrl+F8) so the debugger statements are ignored, or right-click the offending line number and choose "Never pause here", then resume execution (F8).

Manually changing the proxy on your machine

# Sometimes, while analyzing a page for a scraper, the target site bans your IP,
# leaving the browser unable to fetch the page for analysis.
# In that case you can manually switch to a proxy and keep analyzing the page.

# Proxy sites (search for your own; a few recommendations)
- Free proxies:
    - 全网代理IP    www.goubanjia.com
    - 快代理        https://www.kuaidaili.com/
    - 西祠代理      https://www.xicidaili.com/nn/
    - 代理精灵      http://http.zhiliandaili.cn/
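Before pointing the system or browser at a proxy, it is worth checking that the proxy actually works. A minimal sketch using requests (the proxy address is a placeholder; httpbin.org/ip simply echoes the IP it sees):

import requests

# Placeholder proxy address; substitute one taken from the sites above
proxies = {'http': 'http://1.2.3.4:8888', 'https': 'http://1.2.3.4:8888'}

try:
    resp = requests.get('https://httpbin.org/ip', proxies=proxies, timeout=5)
    print('Proxy works, exit IP:', resp.json()['origin'])
except requests.RequestException:
    print('Proxy is unreachable or too slow')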

Building and using a User-Agent pool

import random

class Spider(object):
    def __init__(self):
        # Pool of User-Agent strings to rotate through
        # (note the comma between entries; without it Python silently
        # concatenates adjacent string literals into one garbled UA)
        self.user_agent = [
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:78.0) Gecko/20100101 Firefox/78.0",
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.116 Safari/537.36",
        ]
        self.headers = {}

    def run(self):
        # Pick a random User-Agent for each run
        self.headers['User-Agent'] = random.choice(self.user_agent)
        print(self.headers)

obj = Spider()
obj.run()
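In a real scraper the randomly chosen headers would be passed to each request rather than printed. A minimal sketch of that usage (the URL is a placeholder):

import requests

spider = Spider()
spider.run()  # refreshes spider.headers with a random User-Agent
response = requests.get('http://www.example.com', headers=spider.headers, timeout=8)
print(response.status_code)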

Reporting how long a program takes

import time
import random
import datetime

def func():
    # Simulate a task that takes 1-5 seconds
    time.sleep(random.randint(1, 5))
    return None

if __name__ == '__main__':
    st = datetime.datetime.now()
    print('{} Task started'.format(st.strftime('%Y-%m-%d %H:%M:%S')))
    func()
    et = datetime.datetime.now()
    print('{} Task finished, elapsed: {}'.format(et.strftime('%Y-%m-%d %H:%M:%S'), et - st))
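If several functions need timing, the same pattern can be wrapped in a decorator so it does not have to be repeated. A minimal sketch (slow_task is a hypothetical example function):

import time
import datetime
import functools

def timed(fn):
    """Print start/end timestamps and elapsed time around a function call."""
    @functools.wraps(fn)
    def wrapper(*args, **kwargs):
        st = datetime.datetime.now()
        print('{} Task started'.format(st.strftime('%Y-%m-%d %H:%M:%S')))
        result = fn(*args, **kwargs)
        et = datetime.datetime.now()
        print('{} Task finished, elapsed: {}'.format(et.strftime('%Y-%m-%d %H:%M:%S'), et - st))
        return result
    return wrapper

@timed
def slow_task():
    time.sleep(2)

slow_task()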

Important parameters for scraper requests

import requests
import traceback

session = requests.session()
# User-Agent for the request headers
headers = {
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.116 Safari/537.36'
}
# Query-string parameters (for a GET request, pass these as params=, not data=)
params = {
    "ajaxAction": True
}
# Proxies
proxies = {'http': '127.0.0.1:7891', 'https': '127.0.0.1:7891'}


def spider():
    try:
        response = session.get(url='http://www.xxx.com', headers=headers, params=params, proxies=proxies,
                               timeout=8)  # timeout=8: request timeout in seconds
        if response.status_code == 200:  # status_code is an int, not the string '200'
            return response
        return None
    except Exception:
        msg = 'Request failed, no data retrieved! Exception info: {}'.format(traceback.format_exc())
        print(msg)
        return None
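Since spider() returns None on any failure, callers should guard before touching the response. A short usage sketch:

response = spider()
if response is not None:
    # Only parse the body once the request is known to have succeeded
    print(response.text[:200])
else:
    print('No data retrieved; consider switching proxies or retrying')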

Adding one day to a date string

import datetime

time_str = '2020-08-01'
# Parse the date string directly instead of splitting it by hand
date = datetime.datetime.strptime(time_str, '%Y-%m-%d')
# print(date)  # 2020-08-01 00:00:00
one_day = datetime.timedelta(days=1)
new_date = date + one_day
expect_time = new_date.strftime('%Y-%m-%d')
# print(expect_time)  # 2020-08-02

Thread pool plus running a job at a fixed interval

import time
import datetime
import threading
from concurrent.futures import ThreadPoolExecutor

import requests
from spider import MM  # scraper class (author's own module)
from spider.proxy import get_proxyList  # proxy fetcher (author's own module)


def get_task_information():
    """获取信息查询任务"""
    try:
        url = 'www.xxx.com'
        rep = requests.get(url).json()
        return rep
    except Exception as e:
        print(e)
        return None


def save_function(data):
    """回调函数用于保存数据"""
    try:
        data = data.result()
        if data:
            url = 'www.xoo.com'
            rep = requests.post(url, data=data).text
            print(rep, '保存数据完成!!!!!!!!!!!!!!!!!')
        else:
            return None
    except Exception as e:
        print(e)
        return None


def execute_function(task):
    """调用函数"""
    try:
        obj = MM()  # 自己写的爬虫类
        data = obj.run(obj.delay_info, task)
        return data
    except Exception as e:
        print(e)
        return None


def main_function():
    """主程序"""
    try:
        response = get_task_information()
        if response.get('result'):
            task_list = [{'task': i} for i in response.get('result')]
            with ThreadPoolExecutor(max_workers=30) as tp:
                for task in task_list:
                    tp.submit(execute_function, task).add_done_callback(save_function)
                    as_completed(tp)
            return '任务处理完成!!!'
    except Exception as e:
        print(e)
        return None


t = threading.Thread(target=get_proxyList)
t.start()
while True:
    if not t.is_alive():  # isAlive() was removed in Python 3.9; use is_alive()
        # A finished Thread object cannot be restarted; create a fresh one instead
        t = threading.Thread(target=get_proxyList)
        t.start()
    try:
        st = datetime.datetime.now()
        print('{} Task started'.format(st.strftime('%Y-%m-%d %H:%M:%S')))
        main_function()
        et = datetime.datetime.now()
        print('{} Task finished, elapsed: {}'.format(et.strftime('%Y-%m-%d %H:%M:%S'), et - st))
        time.sleep(300)  # run again after 5 minutes
    except BaseException as e:
        print(e)
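As an alternative to done-callbacks, concurrent.futures.as_completed iterates over futures in the order they finish, which keeps result handling in the main thread. A minimal sketch reusing execute_function and a task_list as above:

from concurrent.futures import ThreadPoolExecutor, as_completed

def run_with_as_completed(task_list):
    with ThreadPoolExecutor(max_workers=30) as tp:
        # Submit everything first, then consume results as they finish
        futures = [tp.submit(execute_function, task) for task in task_list]
        for future in as_completed(futures):
            data = future.result()
            if data:
                print('One task finished with data')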

Converting URL query parameters to a dict

from urllib import parse

def qs(data):
    # Parse the query string carried by a URL into a dict
    query = parse.urlparse(data).query
    # parse_qs returns lists; unwrap single-value lists, keep repeated keys as lists
    return dict([(k, v[0] if len(v) == 1 else v) for k, v in parse.parse_qs(query).items()])

url_data = """?cc=1&ck=1&cl=24-bit&ds=1920x1080&vl=1041&et=0&ja=0&ln=zh-cn&lo=0&lt=1628728417&ck=2"""
print(qs(url_data))
# Output:
# {'cc': '1', 'ck': ['1', '2'], 'cl': '24-bit', 'ds': '1920x1080', 'vl': '1041', 'et': '0', 'ja': '0', 'ln': 'zh-cn', 'lo': '0', 'lt': '1628728417'}
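The reverse direction, building a query string from a dict, is handled by urllib.parse.urlencode; doseq=True expands list values back into repeated keys. A minimal sketch:

from urllib import parse

params = {'cc': '1', 'ck': ['1', '2'], 'cl': '24-bit'}
query_string = parse.urlencode(params, doseq=True)  # doseq expands the 'ck' list
print(query_string)
# cc=1&ck=1&ck=2&cl=24-bit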