博客园提升阅读量

博客园提升阅读量

获取免费代理

随机获取几页的代理(一页12个),返回代理列表,page_nub用来控制爬取的代理页数。

# Request headers passed to requests.get() by get_proxy.
header = {
    'User-Agent': (
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) '
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/118.0.0.0 Safari/537.36'
    ),
}

# 获取免费代理
def get_proxy(page_nub):
    """Scrape free proxies from consecutive kuaidaili.com listing pages.

    A random start page in [1, 7000] is chosen and ``page_nub`` consecutive
    pages are fetched. On each page the ``"ip"`` and ``"port"`` values are
    paired in order of appearance.

    Args:
        page_nub: Number of listing pages to fetch (anything ``int()``
            accepts).

    Returns:
        A list of ``"ip:port"`` strings; empty if nothing could be collected.
    """
    ip_pattern = re.compile(r'"ip": "(.*?)",')
    port_pattern = re.compile(r'"port": "(.*?)",')
    first_page = random.randint(1, 7000)
    proxy_list = []
    for p in range(first_page, first_page + int(page_nub)):
        url = 'https://www.kuaidaili.com/free/inha/' + str(p)
        try:
            res = requests.get(url=url, headers=header, timeout=10)
        except requests.RequestException as exc:
            # One unreachable page must not abort the whole collection run.
            print(f'网页不可访问: {url} ({exc})')
            continue
        if res.status_code != 200:
            print(f'网页不可访问: {url}')
            continue
        print(f'网页可访问: {url}')
        ip_list = ip_pattern.findall(res.text)
        port_list = port_pattern.findall(res.text)
        if len(ip_list) != len(port_list):
            # Unequal counts mean ips and ports cannot be paired reliably.
            print("异常:ip_list != port_list")
            continue
        # No space after the colon: the value is used as a proxy address.
        proxy_list.extend(
            f'{ip}:{port}' for ip, port in zip(ip_list, port_list)
        )
        time.sleep(1)
    print(f'= = = = = = = = = = 共计收集 {len(proxy_list)} 个代理')
    return proxy_list

获取本人所有博客id

根据博客标识获取本人所有博客id,返回为博客id列表,user参数为博客标识,比如:https://www.cnblogs.com/test-gang ,则标识为:test-gang

# 获取博客所有文章id
def blogs_list(user):
    """Collect the post ids of every article listed on a cnblogs blog.

    Listing pages ``https://www.cnblogs.com/{user}?page=N`` are fetched
    starting at N=1 until a page contains no post links (at most 999 pages).
    The view counts found on those pages are summed and printed.

    Args:
        user: Blog identifier, i.e. the path segment after ``cnblogs.com/``.

    Returns:
        A list of post id strings as captured from the post links.
    """
    # Escape the identifier so characters such as '.' or '-' match literally.
    escaped_user = re.escape(user)
    post_pattern = re.compile(
        '<a class="postTitle2 vertical-middle" '
        f'href="https://www.cnblogs.com/{escaped_user}/p/(.*?)">'
    )
    # [^"]* keeps the attribute match inside one tag; \d+ guarantees int().
    read_pattern = re.compile(
        r'<span data-post-id="[^"]*" class="post-view-count">阅读\((\d+)\)</span>'
    )
    blog_list = []
    read = 0
    for i in range(1, 1000):
        url = f'https://www.cnblogs.com/{user}?page={i}'
        try:
            res = requests.get(url, timeout=10)
        except requests.RequestException as exc:
            # Keep what was collected so far instead of crashing.
            print(f'网页不可访问: {url} ({exc})')
            break
        blogs = post_pattern.findall(res.text)
        if not blogs:
            # A page without post links marks the end of the listing.
            break
        blog_list += blogs
        read += sum(int(item) for item in read_pattern.findall(res.text))
    print(f'当前所有文章总阅读量为: {read}')
    return blog_list

获取本人id

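这里的“本人id”指博客主页源码中 `var currentBlogId = ...;` 里的博客 id,最后一步拼接计数接口地址(`https://count.cnblogs.com/blog/post/{博客id}_{文章id}`)时需要用到,同样根据博客标识来获取。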

# 获取本人id
def current_blog_id(blogs):
    """Look up the blog id embedded in a cnblogs blog's home page.

    The id is read from the ``var currentBlogId = ...;`` assignment in the
    page source.

    Args:
        blogs: Blog identifier, i.e. the path segment after ``cnblogs.com/``.

    Returns:
        The captured id as a string, or ``None`` (after printing an error)
        when the page cannot be fetched or does not contain the assignment.
    """
    url = f"https://www.cnblogs.com/{blogs}"
    try:
        res = requests.get(url, timeout=10)
    except requests.RequestException:
        res = None
    if res is not None and res.status_code == 200:
        # re.search returns None when the variable is absent from the page.
        match = re.search(r'var currentBlogId = (.*?);', res.text)
        if match is not None:
            return match.group(1)
    print(f'current_blog_id({blogs}), 错误!!!')
    return None

刷访问量

根据本人id、博客id列表、代理列表进行刷访问量

# 访问博客
def put_blog(proxies, blogs, blogs_id):
    """Send one PUT request to the post counter URL through each proxy.

    For every proxy a random User-Agent and a random post id are chosen and
    ``https://count.cnblogs.com/blog/post/{blogs_id}_{post_id}`` is requested.
    One line is printed per attempt, followed by the elapsed time and the
    number of requests that returned status 200.

    Args:
        proxies: Iterable of proxy addresses, ``"ip:port"`` with an optional
            ``scheme://`` prefix.
        blogs: Non-empty sequence of post ids to pick from.
        blogs_id: Blog id used as the first part of the counter URL path.

    Raises:
        IndexError: If ``blogs`` is empty while ``proxies`` is not.
    """
    start_time = time.time()
    success = 0
    user_agent = [
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/93.0.4577.63 Safari/537.36',
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/93.0.4577.63 Safari/537.36',
        'Mozilla/5.0 (Linux; Android 10) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/93.0.4577.63 Mobile Safari/537.36',
        'Mozilla/5.0 (iPhone; CPU iPhone OS 14_7_1 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) CriOS/93.0.4577.63 Mobile/15E148 Safari/604.1',
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:91.0) Gecko/20100101 Firefox/91.0',
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:91.0) Gecko/20100101 Firefox/91.0',
        'Mozilla/5.0 (Android 10; Mobile; rv:91.0) Gecko/91.0 Firefox/91.0',
        'Mozilla/5.0 (iPhone; CPU iPhone OS 14_7_1 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) FxiOS/37.0 Mobile/15E148 Safari/605.1.15',
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/93.0.4577.63 Safari/537.36 Edg/93.0.961.47',
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/93.0.4577.63 Safari/537.36 Edg/93.0.961.47',
        'Mozilla/5.0 (Linux; Android 10) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/93.0.4577.63 Mobile Safari/537.36 Edg/93.0.961.47',
        'Mozilla/5.0 (iPhone; CPU iPhone OS 14_7_1 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.7.1 EdgiOS/46.12.4 Mobile/15E148 Safari/605.1.15',
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36',
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:125.0) Gecko/20100101 Firefox/125.0',
    ]
    for i in proxies:
        header_blog = {
            'User-Agent': random.choice(user_agent)
        }
        url = f"https://count.cnblogs.com/blog/post/{blogs_id}_{random.choice(blogs)}"
        # Accept "ip: port" (with whitespace) and add a scheme when missing.
        address = ''.join(str(i).split())
        if '://' not in address:
            address = f'http://{address}'
        # requests looks proxies up by bare scheme name ('http', 'https');
        # the counter URL is https, so both keys are provided.
        proxy = {'http': address, 'https': address}
        try:
            res = requests.put(url, proxies=proxy, timeout=5, headers=header_blog)
        except requests.RequestException as exc:
            # A dead or slow proxy counts as a failed attempt, not a crash.
            print(f'访问失败: {i} ({exc})')
        else:
            if res.status_code == 200:
                success += 1
                print(f'访问成功: {i} => {url}, User-Agent: {header_blog.get("User-Agent")}')
            else:
                print(f'访问失败: {i}')
        # Random pause of 0.0-5.9 seconds between attempts.
        time.sleep(random.randint(0, 5) + (random.randint(0, 9)) / 10)
    print(f'= = = = = = = = = = 用时 {time.time() - start_time} 秒')
    print(f'= = = = = = = = = = 共计成功访问 {success} 次')

完整代码

import random
import re
import time
import requests

# Request headers passed to requests.get() by get_proxy.
header = {
    'User-Agent': (
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) '
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/118.0.0.0 Safari/537.36'
    ),
}


# 获取免费代理
def get_proxy(page_nub):
    """Scrape free proxies from consecutive kuaidaili.com listing pages.

    A random start page in [1, 7000] is chosen and ``page_nub`` consecutive
    pages are fetched. On each page the ``"ip"`` and ``"port"`` values are
    paired in order of appearance.

    Args:
        page_nub: Number of listing pages to fetch (anything ``int()``
            accepts).

    Returns:
        A list of ``"ip:port"`` strings; empty if nothing could be collected.
    """
    ip_pattern = re.compile(r'"ip": "(.*?)",')
    port_pattern = re.compile(r'"port": "(.*?)",')
    first_page = random.randint(1, 7000)
    proxy_list = []
    for p in range(first_page, first_page + int(page_nub)):
        url = 'https://www.kuaidaili.com/free/inha/' + str(p)
        try:
            res = requests.get(url=url, headers=header, timeout=10)
        except requests.RequestException as exc:
            # One unreachable page must not abort the whole collection run.
            print(f'网页不可访问: {url} ({exc})')
            continue
        if res.status_code != 200:
            print(f'网页不可访问: {url}')
            continue
        print(f'网页可访问: {url}')
        ip_list = ip_pattern.findall(res.text)
        port_list = port_pattern.findall(res.text)
        if len(ip_list) != len(port_list):
            # Unequal counts mean ips and ports cannot be paired reliably.
            print("异常:ip_list != port_list")
            continue
        # No space after the colon: the value is used as a proxy address.
        proxy_list.extend(
            f'{ip}:{port}' for ip, port in zip(ip_list, port_list)
        )
        time.sleep(1)
    print(f'= = = = = = = = = = 共计收集 {len(proxy_list)} 个代理')
    return proxy_list


# 获取博客所有文章id
def blogs_list(user):
    """Collect the post ids of every article listed on a cnblogs blog.

    Listing pages ``https://www.cnblogs.com/{user}?page=N`` are fetched
    starting at N=1 until a page contains no post links (at most 999 pages).
    The view counts found on those pages are summed and printed.

    Args:
        user: Blog identifier, i.e. the path segment after ``cnblogs.com/``.

    Returns:
        A list of post id strings as captured from the post links.
    """
    # Escape the identifier so characters such as '.' or '-' match literally.
    escaped_user = re.escape(user)
    post_pattern = re.compile(
        '<a class="postTitle2 vertical-middle" '
        f'href="https://www.cnblogs.com/{escaped_user}/p/(.*?)">'
    )
    # [^"]* keeps the attribute match inside one tag; \d+ guarantees int().
    read_pattern = re.compile(
        r'<span data-post-id="[^"]*" class="post-view-count">阅读\((\d+)\)</span>'
    )
    blog_list = []
    read = 0
    for i in range(1, 1000):
        url = f'https://www.cnblogs.com/{user}?page={i}'
        try:
            res = requests.get(url, timeout=10)
        except requests.RequestException as exc:
            # Keep what was collected so far instead of crashing.
            print(f'网页不可访问: {url} ({exc})')
            break
        blogs = post_pattern.findall(res.text)
        if not blogs:
            # A page without post links marks the end of the listing.
            break
        blog_list += blogs
        read += sum(int(item) for item in read_pattern.findall(res.text))
    print(f'当前所有文章总阅读量为: {read}')
    return blog_list

def current_blog_id(blogs):
    """Look up the blog id embedded in a cnblogs blog's home page.

    The id is read from the ``var currentBlogId = ...;`` assignment in the
    page source.

    Args:
        blogs: Blog identifier, i.e. the path segment after ``cnblogs.com/``.

    Returns:
        The captured id as a string, or ``None`` (after printing an error)
        when the page cannot be fetched or does not contain the assignment.
    """
    url = f"https://www.cnblogs.com/{blogs}"
    try:
        res = requests.get(url, timeout=10)
    except requests.RequestException:
        res = None
    if res is not None and res.status_code == 200:
        # re.search returns None when the variable is absent from the page.
        match = re.search(r'var currentBlogId = (.*?);', res.text)
        if match is not None:
            return match.group(1)
    print(f'current_blog_id({blogs}), 错误!!!')
    return None


# 访问博客
def put_blog(proxies, blogs, blogs_id):
    """Send one PUT request to the post counter URL through each proxy.

    For every proxy a random User-Agent and a random post id are chosen and
    ``https://count.cnblogs.com/blog/post/{blogs_id}_{post_id}`` is requested.
    One line is printed per attempt, followed by the elapsed time and the
    number of requests that returned status 200.

    Args:
        proxies: Iterable of proxy addresses, ``"ip:port"`` with an optional
            ``scheme://`` prefix.
        blogs: Non-empty sequence of post ids to pick from.
        blogs_id: Blog id used as the first part of the counter URL path.

    Raises:
        IndexError: If ``blogs`` is empty while ``proxies`` is not.
    """
    start_time = time.time()
    success = 0
    user_agent = [
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/93.0.4577.63 Safari/537.36',
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/93.0.4577.63 Safari/537.36',
        'Mozilla/5.0 (Linux; Android 10) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/93.0.4577.63 Mobile Safari/537.36',
        'Mozilla/5.0 (iPhone; CPU iPhone OS 14_7_1 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) CriOS/93.0.4577.63 Mobile/15E148 Safari/604.1',
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:91.0) Gecko/20100101 Firefox/91.0',
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:91.0) Gecko/20100101 Firefox/91.0',
        'Mozilla/5.0 (Android 10; Mobile; rv:91.0) Gecko/91.0 Firefox/91.0',
        'Mozilla/5.0 (iPhone; CPU iPhone OS 14_7_1 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) FxiOS/37.0 Mobile/15E148 Safari/605.1.15',
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/93.0.4577.63 Safari/537.36 Edg/93.0.961.47',
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/93.0.4577.63 Safari/537.36 Edg/93.0.961.47',
        'Mozilla/5.0 (Linux; Android 10) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/93.0.4577.63 Mobile Safari/537.36 Edg/93.0.961.47',
        'Mozilla/5.0 (iPhone; CPU iPhone OS 14_7_1 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.7.1 EdgiOS/46.12.4 Mobile/15E148 Safari/605.1.15',
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36',
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:125.0) Gecko/20100101 Firefox/125.0',
    ]
    for i in proxies:
        header_blog = {
            'User-Agent': random.choice(user_agent)
        }
        url = f"https://count.cnblogs.com/blog/post/{blogs_id}_{random.choice(blogs)}"
        # Accept "ip: port" (with whitespace) and add a scheme when missing.
        address = ''.join(str(i).split())
        if '://' not in address:
            address = f'http://{address}'
        # requests looks proxies up by bare scheme name ('http', 'https');
        # the counter URL is https, so both keys are provided.
        proxy = {'http': address, 'https': address}
        try:
            res = requests.put(url, proxies=proxy, timeout=5, headers=header_blog)
        except requests.RequestException as exc:
            # A dead or slow proxy counts as a failed attempt, not a crash.
            print(f'访问失败: {i} ({exc})')
        else:
            if res.status_code == 200:
                success += 1
                print(f'访问成功: {i} => {url}, User-Agent: {header_blog.get("User-Agent")}')
            else:
                print(f'访问失败: {i}')
        # Random pause of 0.0-5.9 seconds between attempts.
        time.sleep(random.randint(0, 5) + (random.randint(0, 9)) / 10)
    print(f'= = = = = = = = = = 用时 {time.time() - start_time} 秒')
    print(f'= = = = = = = = = = 共计成功访问 {success} 次')

# Script entry point: ask for the blog identifier, resolve the blog id and
# post ids once, then keep requesting the counter URL through fresh proxies.
if __name__ == '__main__':
    # strip() drops whitespace accidentally typed around the identifier.
    blog_user = input('请输入博客园用户标识:\n例如,https://www.cnblogs.com/test-gang, 则输入 test-gang\n').strip()
    blog_id = current_blog_id(blog_user)
    blog = blogs_list(blog_user)

    # Without a blog id or any post id there is no valid counter URL to build.
    if blog_id is None or not blog:
        raise SystemExit(f'未能获取 {blog_user} 的博客 id 或文章列表, 已退出')

    try:
        while True:
            put_blog(proxies=get_proxy(1), blogs=blog, blogs_id=blog_id)
    except KeyboardInterrupt:
        # Ctrl-C is the only way out of the loop; exit without a traceback.
        print('\n已停止')

说明

因为博客园自身的防刷机制,每5分钟阅读量最多+1(根据代码执行效果猜测),所以效率还是很低的。

免责声明

所分享资料目的只用于探讨,切勿使用资料中的技术进行违法活动,读者利用资料中的相关技术进行违法活动,造成的后果与作者本人无关。

posted @ 2024-05-27 16:33  测试小罡  阅读(2519)  评论(0)  编辑  收藏  举报