博客园提升阅读量
博客园提升阅读量
获取免费代理
随机获取几页的代理(一页12个),返回代理列表,page_nub用来控制爬取的代理页数。
# Request headers sent with the proxy-listing page requests (desktop Chrome User-Agent).
header = {
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/118.0.0.0 Safari/537.36',
}
# Fetch free proxies
def get_proxy(page_nub):
    """Scrape consecutive pages of the kuaidaili free-proxy listing.

    A random start page in 1..7000 is chosen, then ``page_nub`` consecutive
    pages are requested. On every page that answers with HTTP 200 the
    ``"ip"`` and ``"port"`` fields are pulled out of the response text with
    regular expressions and paired up positionally.

    Args:
        page_nub: Number of listing pages to fetch (converted with ``int``).

    Returns:
        A list of strings formatted as ``'<ip>: <port>'``.
    """
    first_page = random.randint(1, 7000)
    proxy_list = []
    for page_no in range(first_page, first_page + int(page_nub)):
        url = 'https://www.kuaidaili.com/free/inha/' + str(page_no)
        response = requests.get(url=url, headers=header)

        # Guard clause: skip pages that did not load.
        if response.status_code != 200:
            print(f'网页不可访问: {url}')
            continue
        print(f'网页可访问: {url}')

        ips = re.findall("\"ip\": \"(.*?)\",", string=response.text)
        ports = re.findall("\"port\": \"(.*?)\",", string=response.text)

        # The two lists are paired by position, so they must be equally long.
        if len(ips) != len(ports):
            print("异常:ip_list != port_list")
            continue

        proxy_list.extend(f'{ip}: {port}' for ip, port in zip(ips, ports))
        # Pause only after a page was parsed successfully.
        time.sleep(1)
    print(f'= = = = = = = = = = 共计收集 {len(proxy_list)} 个代理')
    return proxy_list
获取本人所有博客id
根据博客标识获取本人所有博客id,返回为博客id列表,user参数为博客标识,比如:https://www.cnblogs.com/test-gang ,则标识为:test-gang
# Collect the ids of every post on the blog
def blogs_list(user):
    """Return the post ids found on a cnblogs user's paginated home page.

    Pages ``?page=1`` .. ``?page=999`` are requested in order until a page
    contains no post links. While walking the pages the per-post view
    counters shown on each page are summed, and the total is printed once
    before returning.

    Args:
        user: The blog identifier, i.e. the path segment after
            ``https://www.cnblogs.com/``.

    Returns:
        A list of post-id strings (the part after ``/p/`` in each post URL).
        Always a list, including when the page limit is reached.
    """
    # Raw strings avoid invalid-escape warnings for ``\(``; ``re.escape``
    # keeps regex metacharacters in ``user`` from changing the pattern.
    post_link = re.compile(
        rf'<a class="postTitle2 vertical-middle" href="https://www.cnblogs.com/{re.escape(user)}/p/(.*?)">'
    )
    # ``[^"]*`` stays inside one attribute value, so several counters on the
    # same line are matched individually instead of being swallowed by ``.*``.
    view_count = re.compile(
        r'<span data-post-id="[^"]*" class="post-view-count">阅读\((\d+)\)</span>'
    )

    blog_list = []
    read = 0
    for i in range(1, 1000):
        url = f'https://www.cnblogs.com/{user}?page={i}'
        # A timeout keeps a stalled connection from hanging the script forever.
        res = requests.get(url, timeout=10)
        blogs = post_link.findall(res.text)
        if not blogs:
            break
        blog_list += blogs
        read += sum(int(item) for item in view_count.findall(res.text))
    print(f'当前所有文章总阅读量为: {read}')
    return blog_list
获取本人id
本人id 指博客页面源码中的 currentBlogId 变量的值,同样根据博客标识获取;最后一步拼接计数接口地址时需要用到它。
# Look up the blog's own id
def current_blog_id(blogs):
    """Read the ``currentBlogId`` value embedded in a cnblogs home page.

    Args:
        blogs: The blog identifier, i.e. the path segment after
            ``https://www.cnblogs.com/``.

    Returns:
        The text assigned to ``var currentBlogId`` in the page source, or
        ``None`` (after printing an error line) when the page does not answer
        with HTTP 200 or the variable is not present in the page.
    """
    url = f"https://www.cnblogs.com/{blogs}"
    res = requests.get(url)
    if res.status_code == 200:
        found = re.search(r'var currentBlogId = (.*?);', res.text)
        # A 200 page without the variable is treated like any other failure
        # instead of raising AttributeError on ``None.group``.
        if found:
            return found.group(1)
    print(f'current_blog_id({blogs}), 错误!!!')
    return None
刷访问量
根据本人id、博客id列表、代理列表进行刷访问量
# Visit blog posts
def put_blog(proxies, blogs, blogs_id):
    """Issue one HTTP PUT per entry of ``proxies`` and print a summary.

    For every entry a random User-Agent and a random element of ``blogs`` are
    chosen, and a PUT request is sent to
    ``https://count.cnblogs.com/blog/post/{blogs_id}_{post}`` with a 5 second
    timeout. Responses with status 200 are counted as successes.

    Args:
        proxies: Iterable of proxy strings; one request is made per entry.
        blogs: Non-empty sequence of post ids to pick from at random.
        blogs_id: Value placed before the underscore in the request URL.

    Returns:
        None. Progress, elapsed time and the success count are printed.
    """
    start_time = time.time()
    success = 0
    # Pool of browser User-Agent strings; one is picked at random per request.
    user_agent = [
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/93.0.4577.63 Safari/537.36',
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/93.0.4577.63 Safari/537.36',
        'Mozilla/5.0 (Linux; Android 10) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/93.0.4577.63 Mobile Safari/537.36',
        'Mozilla/5.0 (iPhone; CPU iPhone OS 14_7_1 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) CriOS/93.0.4577.63 Mobile/15E148 Safari/604.1',
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:91.0) Gecko/20100101 Firefox/91.0',
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:91.0) Gecko/20100101 Firefox/91.0',
        'Mozilla/5.0 (Android 10; Mobile; rv:91.0) Gecko/91.0 Firefox/91.0',
        'Mozilla/5.0 (iPhone; CPU iPhone OS 14_7_1 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) FxiOS/37.0 Mobile/15E148 Safari/605.1.15',
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/93.0.4577.63 Safari/537.36 Edg/93.0.961.47',
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/93.0.4577.63 Safari/537.36 Edg/93.0.961.47',
        'Mozilla/5.0 (Linux; Android 10) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/93.0.4577.63 Mobile Safari/537.36 Edg/93.0.961.47',
        'Mozilla/5.0 (iPhone; CPU iPhone OS 14_7_1 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.7.1 EdgiOS/46.12.4 Mobile/15E148 Safari/605.1.15',
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36',
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:125.0) Gecko/20100101 Firefox/125.0'
    ]
    for i in proxies:
        header_blog = {
            'User-Agent': random.choice(user_agent)
        }
        url = f"https://count.cnblogs.com/blog/post/{blogs_id}_{random.choice(blogs)}"
        proxy = {'http:': i}
        res = requests.put(url, proxies=proxy, timeout=5, headers=header_blog)
        if res.status_code == 200:
            success += 1
            print(f'访问成功: {i} => {url}, User-Agent: {header_blog.get("User-Agent")}')
        else:
            print(f'访问失败: {i}')
        # Random pause between requests: 0.0 to 5.9 seconds.
        time.sleep(random.randint(0, 5) + (random.randint(0, 9)) / 10)
    print(f'= = = = = = = = = = 用时 {time.time() - start_time} 秒')
    print(f'= = = = = = = = = = 共计成功访问 {success} 次')
完整代码
import random
import re
import time
import requests
# Request headers sent with the proxy-listing page requests (desktop Chrome User-Agent).
header = {
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/118.0.0.0 Safari/537.36',
}
# Fetch free proxies
def get_proxy(page_nub):
    """Scrape consecutive pages of the kuaidaili free-proxy listing.

    A random start page in 1..7000 is chosen, then ``page_nub`` consecutive
    pages are requested. On every page that answers with HTTP 200 the
    ``"ip"`` and ``"port"`` fields are pulled out of the response text with
    regular expressions and paired up positionally.

    Args:
        page_nub: Number of listing pages to fetch (converted with ``int``).

    Returns:
        A list of strings formatted as ``'<ip>: <port>'``.
    """
    first_page = random.randint(1, 7000)
    proxy_list = []
    for page_no in range(first_page, first_page + int(page_nub)):
        url = 'https://www.kuaidaili.com/free/inha/' + str(page_no)
        response = requests.get(url=url, headers=header)

        # Guard clause: skip pages that did not load.
        if response.status_code != 200:
            print(f'网页不可访问: {url}')
            continue
        print(f'网页可访问: {url}')

        ips = re.findall("\"ip\": \"(.*?)\",", string=response.text)
        ports = re.findall("\"port\": \"(.*?)\",", string=response.text)

        # The two lists are paired by position, so they must be equally long.
        if len(ips) != len(ports):
            print("异常:ip_list != port_list")
            continue

        proxy_list.extend(f'{ip}: {port}' for ip, port in zip(ips, ports))
        # Pause only after a page was parsed successfully.
        time.sleep(1)
    print(f'= = = = = = = = = = 共计收集 {len(proxy_list)} 个代理')
    return proxy_list
# Collect the ids of every post on the blog
def blogs_list(user):
    """Return the post ids found on a cnblogs user's paginated home page.

    Pages ``?page=1`` .. ``?page=999`` are requested in order until a page
    contains no post links. While walking the pages the per-post view
    counters shown on each page are summed, and the total is printed once
    before returning.

    Args:
        user: The blog identifier, i.e. the path segment after
            ``https://www.cnblogs.com/``.

    Returns:
        A list of post-id strings (the part after ``/p/`` in each post URL).
        Always a list, including when the page limit is reached.
    """
    # Raw strings avoid invalid-escape warnings for ``\(``; ``re.escape``
    # keeps regex metacharacters in ``user`` from changing the pattern.
    post_link = re.compile(
        rf'<a class="postTitle2 vertical-middle" href="https://www.cnblogs.com/{re.escape(user)}/p/(.*?)">'
    )
    # ``[^"]*`` stays inside one attribute value, so several counters on the
    # same line are matched individually instead of being swallowed by ``.*``.
    view_count = re.compile(
        r'<span data-post-id="[^"]*" class="post-view-count">阅读\((\d+)\)</span>'
    )

    blog_list = []
    read = 0
    for i in range(1, 1000):
        url = f'https://www.cnblogs.com/{user}?page={i}'
        # A timeout keeps a stalled connection from hanging the script forever.
        res = requests.get(url, timeout=10)
        blogs = post_link.findall(res.text)
        if not blogs:
            break
        blog_list += blogs
        read += sum(int(item) for item in view_count.findall(res.text))
    print(f'当前所有文章总阅读量为: {read}')
    return blog_list
def current_blog_id(blogs):
    """Read the ``currentBlogId`` value embedded in a cnblogs home page.

    Args:
        blogs: The blog identifier, i.e. the path segment after
            ``https://www.cnblogs.com/``.

    Returns:
        The text assigned to ``var currentBlogId`` in the page source, or
        ``None`` (after printing an error line) when the page does not answer
        with HTTP 200 or the variable is not present in the page.
    """
    url = f"https://www.cnblogs.com/{blogs}"
    res = requests.get(url)
    if res.status_code == 200:
        found = re.search(r'var currentBlogId = (.*?);', res.text)
        # A 200 page without the variable is treated like any other failure
        # instead of raising AttributeError on ``None.group``.
        if found:
            return found.group(1)
    print(f'current_blog_id({blogs}), 错误!!!')
    return None
# Visit blog posts
def put_blog(proxies, blogs, blogs_id):
    """Issue one HTTP PUT per entry of ``proxies`` and print a summary.

    For every entry a random User-Agent and a random element of ``blogs`` are
    chosen, and a PUT request is sent to
    ``https://count.cnblogs.com/blog/post/{blogs_id}_{post}`` with a 5 second
    timeout. Responses with status 200 are counted as successes.

    Args:
        proxies: Iterable of proxy strings; one request is made per entry.
        blogs: Non-empty sequence of post ids to pick from at random.
        blogs_id: Value placed before the underscore in the request URL.

    Returns:
        None. Progress, elapsed time and the success count are printed.
    """
    start_time = time.time()
    success = 0
    # Pool of browser User-Agent strings; one is picked at random per request.
    user_agent = [
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/93.0.4577.63 Safari/537.36',
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/93.0.4577.63 Safari/537.36',
        'Mozilla/5.0 (Linux; Android 10) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/93.0.4577.63 Mobile Safari/537.36',
        'Mozilla/5.0 (iPhone; CPU iPhone OS 14_7_1 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) CriOS/93.0.4577.63 Mobile/15E148 Safari/604.1',
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:91.0) Gecko/20100101 Firefox/91.0',
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:91.0) Gecko/20100101 Firefox/91.0',
        'Mozilla/5.0 (Android 10; Mobile; rv:91.0) Gecko/91.0 Firefox/91.0',
        'Mozilla/5.0 (iPhone; CPU iPhone OS 14_7_1 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) FxiOS/37.0 Mobile/15E148 Safari/605.1.15',
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/93.0.4577.63 Safari/537.36 Edg/93.0.961.47',
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/93.0.4577.63 Safari/537.36 Edg/93.0.961.47',
        'Mozilla/5.0 (Linux; Android 10) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/93.0.4577.63 Mobile Safari/537.36 Edg/93.0.961.47',
        'Mozilla/5.0 (iPhone; CPU iPhone OS 14_7_1 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.7.1 EdgiOS/46.12.4 Mobile/15E148 Safari/605.1.15',
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36',
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:125.0) Gecko/20100101 Firefox/125.0'
    ]
    for i in proxies:
        header_blog = {
            'User-Agent': random.choice(user_agent)
        }
        url = f"https://count.cnblogs.com/blog/post/{blogs_id}_{random.choice(blogs)}"
        proxy = {'http:': i}
        res = requests.put(url, proxies=proxy, timeout=5, headers=header_blog)
        if res.status_code == 200:
            success += 1
            print(f'访问成功: {i} => {url}, User-Agent: {header_blog.get("User-Agent")}')
        else:
            print(f'访问失败: {i}')
        # Random pause between requests: 0.0 to 5.9 seconds.
        time.sleep(random.randint(0, 5) + (random.randint(0, 9)) / 10)
    print(f'= = = = = = = = = = 用时 {time.time() - start_time} 秒')
    print(f'= = = = = = = = = = 共计成功访问 {success} 次')
# Ask for the cnblogs user identifier (the path segment after the domain).
blog_user = input('请输入博客园用户标识:\n例如,https://www.cnblogs.com/test-gang, 则输入 test-gang\n')
# Resolve the blog id and the list of post ids once, before the loop.
blog_id = current_blog_id(blog_user)
blog = blogs_list(blog_user)
# Runs until interrupted: each round fetches one listing page of proxies
# and passes them to put_blog together with the ids gathered above.
while True:
    put_blog(proxies=get_proxy(1), blogs=blog, blogs_id=blog_id)
说明
因为博客园自身的防刷机制,每5分钟阅读量最多+1(根据代码执行效果猜测),所以效率还是很低的。
免责声明
所分享资料目的只用于探讨,切勿使用资料中的技术进行违法活动,读者利用资料中的相关技术进行违法活动,造成的后果与作者本人无关。