给女朋友写的一个利用搜索引擎爬取会议论文的脚本

import bs4, requests, os
from multiprocessing import Manager, Pool

# Red: used for error messages
def R(message):
    """Return ``message`` wrapped in the ANSI sequence for bold bright red.

    The text is followed by a reset sequence so later output is unaffected.
    """
    return f"\033[1;91m{message}\033[0;m"
# Green: used for success messages
def G(message):
    """Return ``message`` wrapped in the ANSI sequence for bold bright green.

    The text is followed by a reset sequence so later output is unaffected.
    """
    return f"\033[1;92m{message}\033[0;m"

def B(message):
    """Return ``message`` wrapped in the ANSI sequence for bold bright blue.

    The text is followed by a reset sequence so later output is unaffected.
    """
    return f"\033[1;94m{message}\033[0;m"

# Manager-backed dict proxy shared between the worker processes;
# maps a paper title to the download link found for it.
url_dict = Manager().dict()

# Paper titles to search for.
key_list = [
    "On the TOCTOU Problem in Remote Attestation",
    "Search-based Approaches for Local Black-Box Code Deobfuscation: Understand, Improve and Mitigate",
    "Exorcising Spectres with Secure Compilers",
]

# Request headers carrying a desktop-browser User-Agent string.
fakeua = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.84 Safari/537.36",
}

# Only the first 10 links of each search result page are inspected.
search_depth = 10

# Upper bound on the number of pool workers (passed to Pool as ``processes``).
thread_num = 8

def search_page(search_list):
	"""Search Bing for each paper title and record the first PDF-looking link.

	For every title the Bing result page is fetched and the elements matching
	the ``.sh_favicon`` CSS selector are scanned (at most ``search_depth`` of
	them). The first ``href`` containing the substring "pdf" is stored in the
	shared ``url_dict`` under the title.

	A failed search request is reported and the loop moves on to the next
	title, so one bad request does not abort the rest of this worker's batch.

	Args:
		search_list: iterable of paper titles to search for.
	"""
	print(B('[*]Starting search page now......'))
	for keywords in search_list:
		# Let requests build the query string so that titles containing
		# characters such as '&', '#' or '+' are encoded correctly.
		try:
			searchPage = requests.get(
				'https://cn.bing.com/search',
				params={'q': keywords},
				headers=fakeua,
				timeout=30,
			)
			searchPage.raise_for_status()
		except requests.RequestException as exc:
			print(R('[-] Search request failed for paper "%s": %s' % (keywords, exc)))
			continue

		# Collect the candidate link elements from the result page.
		# NOTE(review): '.sh_favicon' is presumably the per-result link element
		# in Bing's markup - confirm it still matches the current page layout.
		searchSoup = bs4.BeautifulSoup(searchPage.text, features="html.parser")
		elements = searchSoup.select('.sh_favicon')

		# Scan the first search_depth candidates for a link mentioning "pdf".
		pdf_href = None
		for element in elements[:search_depth]:
			href = element.get('href')
			# get() returns None when the element has no href attribute.
			if href and "pdf" in href:
				pdf_href = href
				break

		if pdf_href is not None:
			url_dict[keywords] = pdf_href
			print(G('[+] Get download href "%s" for paper "%s"' % (pdf_href, keywords)))
		else:
			print(R('[-] Cannot get download href for paper "%s"' % (keywords)))

def download_page(search_list):
	"""Download the PDF recorded in ``url_dict`` for each paper title.

	Files are written to a ``result`` directory under the current working
	directory, named after the title with unsafe characters removed by
	``filter_key``. Titles without a recorded link are skipped. A failed
	download is reported in red and does not stop the remaining downloads.

	Args:
		search_list: iterable of paper titles whose links should be fetched.
	"""
	print(B('[*]Starting download page now......'))
	# A relative component is required here: joining with '\\result' would
	# discard the working directory on Windows.
	result_dir = os.path.join(os.getcwd(), 'result')
	for key in search_list:
		url = url_dict.get(key)
		if url is None:
			# No link was found for this title; keep going with the others.
			continue
		file_name = filter_key(key)
		try:
			# exist_ok avoids a failure when several workers create the
			# directory at the same time.
			os.makedirs(result_dir, exist_ok=True)
			data = requests.get(url, headers=fakeua, timeout=30)
			# Do not save an HTTP error page under a .pdf name.
			data.raise_for_status()
			page_path = os.path.join(result_dir, '%s.pdf' % file_name)
			with open(page_path, 'wb') as fp:
				fp.write(data.content)
		except (requests.RequestException, OSError) as exc:
			print(R('[-] Failed download page "%s.pdf": %s' % (file_name, exc)))
		else:
			print(G('[+] Successfully download page "%s.pdf"' % (file_name)))

# Remove special characters from a name
def filter_key(key):
	"""Return ``key`` with the special characters stripped out.

	The removed characters are: slash, backslash, colon, asterisk, question
	mark, double quote, less-than, greater-than and the vertical bar.
	"""
	forbidden = ('/', '\\', ':', '*', '?', '"', '<', '>', '|')
	kept = [symbol for symbol in key if symbol not in forbidden]
	return ''.join(kept)

def gen_result_txt():
	"""Write every ``title ==> link`` pair from ``url_dict`` to result.txt.

	The file is created (or overwritten) in the current working directory,
	one pair per line.
	"""
	path = os.path.join(os.getcwd(), 'result.txt')
	# Explicit UTF-8 so titles or links with non-ASCII characters can be
	# written regardless of the platform's default encoding.
	with open(path, 'w', encoding='utf-8') as fp:
		fp.writelines(
			'%s ==> %s\n' % (key, value) for key, value in url_dict.items()
		)
	print(G('[+] Successfully gen result text in: %s' % (path)))

if __name__ == '__main__':
	# Distribute the titles over the workers. At least one worker is kept so
	# an empty key_list does not lead to a division by zero or an invalid pool
	# size; striding (i, i + n, i + 2n, ...) keeps the batches balanced rather
	# than giving the whole remainder to the last worker.
	worker_count = max(1, min(thread_num, len(key_list)))
	search_list = [key_list[i::worker_count] for i in range(worker_count)]
	print(search_list)

	# map() blocks until every batch has been processed, so no explicit
	# join() is needed; the with-block shuts the pool down afterwards.
	with Pool(processes=worker_count) as pool:
		pool.map(search_page, search_list)
		gen_result_txt()
		print(url_dict)
		pool.map(download_page, search_list)

posted @ 2022-10-04 00:13 z5onk0 阅读(63) 评论(0) 编辑收藏举报

刷新页面返回顶部

登录后才能查看或发表评论，立即登录或者逛逛博客园首页

相关博文：

· pdf教程爬取

· exchange邮件爬虫

· 爬取大学论文

· 多线程爬虫

· Python爬虫开发

阅读排行：
· 物流快递公司核心技术能力-地址解析分单基础技术分享
· .NET 10首个预览版发布：重大改进与新特性概览！
· AI与.NET技术实操系列（二）：开始使用ML.NET
· 单线程的Redis速度为什么快？
· Pantheons：用 TypeScript 打造主流大模型对话的一站式集成库

公告

昵称： z5onk0
园龄： 2年5个月
粉丝： 27
关注： 2

+加关注

2025年3月

日

一

二

三

四

五

六

z5onk0

给女朋友写的一个利用搜索引擎爬取会议论文的脚本

公告

搜索

常用链接

随笔分类

随笔档案

相册

阅读排行榜

评论排行榜

推荐排行榜

最新评论