# 【2022.3.20】给女朋友写的一个利用搜索引擎爬取会议论文的脚本,用的BeautifulSoup解析网页,第一次接触爬虫,不太会......
# (2022-03-20: a script that uses a search engine to fetch conference papers, parsed with BeautifulSoup.)
import bs4, requests, os
from multiprocessing import Manager, Pool
def R(message):
    # Wrap *message* in ANSI escape codes for bright red terminal output.
    return f"\033[1;91m{message}\033[0;m"
def G(message):
    # Wrap *message* in ANSI escape codes for bright green terminal output.
    return f"\033[1;92m{message}\033[0;m"
def B(message):
    # Wrap *message* in ANSI escape codes for bright blue terminal output.
    return f"\033[1;94m{message}\033[0;m"
# Shared mapping {paper title -> resolved PDF URL}, writable from worker
# processes via a multiprocessing.Manager proxy.
# NOTE(review): creating Manager() at module import time (outside the
# __main__ guard) is fragile under the Windows "spawn" start method, which
# re-imports this module in every child — confirm this runs only on
# platforms where it behaves.
url_dict = Manager().dict()
# Paper titles to search for.
key_list = ["On the TOCTOU Problem in Remote Attestation", "Search-based Approaches for Local Black-Box Code Deobfuscation: Understand, Improve and Mitigate", "Exorcising Spectres with Secure Compilers"]
# Desktop-browser User-Agent so Bing serves the normal result page.
fakeua = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.84 Safari/537.36"}
# How many search results per query to inspect for a PDF link.
search_depth = 10
# Upper bound on worker-process count (capped to len(key_list) in __main__).
thread_num = 8
def search_page(search_list):
    """Search Bing for each paper title and record the first PDF link found.

    For every title in *search_list*, fetch the Bing result page, scan up to
    ``search_depth`` result links (elements with class ``sh_favicon``), and
    store the first href containing "pdf" into the shared ``url_dict``.

    :param search_list: iterable of paper-title strings to search for.
    :raises requests.HTTPError: if the search request returns a bad status.
    """
    print(B('[*]Starting search page now......'))
    for keywords in search_list:
        # Let requests build/escape the query string instead of raw string
        # concatenation, so titles with '&', '?', etc. are encoded correctly.
        searchPage = requests.get('https://cn.bing.com/search',
                                  params={'q': keywords}, headers=fakeua)
        searchPage.raise_for_status()
        searchSoup = bs4.BeautifulSoup(searchPage.text, features="html.parser")
        elements = searchSoup.select('.sh_favicon')
        found_href = None
        for element in elements[:search_depth]:
            href = element.get('href')
            # .get() returns None when the attribute is missing; the original
            # bare except/pass silently swallowed the resulting TypeError.
            if href and "pdf" in href:
                url_dict[keywords] = href
                found_href = href
                break
        if found_href:
            print(G('[+] Get download href "%s" for paper "%s"' % (found_href, keywords)))
        else:
            print(R('[-] Cannot get download href for paper "%s"' % (keywords)))
def download_page(search_list):
    """Download each resolved PDF from ``url_dict`` into ``<cwd>/result``.

    Titles without a resolved URL are skipped; download or filesystem errors
    for one paper are reported and do not stop the remaining downloads.

    :param search_list: iterable of paper-title strings (keys of ``url_dict``).
    """
    print(B('[*]Starting download page now......'))
    for key in search_list:
        if key not in url_dict:
            # Bug fix: the original used `break`, which abandoned every
            # remaining paper after the first unresolved title.
            continue
        url = url_dict[key]
        key = filter_key(key)  # strip characters illegal in file names
        try:
            data = requests.get(url, headers=fakeua, stream=True, timeout=30)
            data.raise_for_status()
            # Bug fix: joining with '\\result' collapsed to the drive root on
            # Windows (os.path.join discards the cwd before an absolute part).
            result_dir = os.path.join(os.getcwd(), 'result')
            os.makedirs(result_dir, exist_ok=True)
            page_path = os.path.join(result_dir, '%s.pdf' % key)
            with open(page_path, 'wb') as fp:
                fp.write(data.content)
            print(G('[+] Successfully download page "%s.pdf"' % (key)))
        except (requests.RequestException, OSError):
            # Best-effort per paper; bug fix: failure now prints in red (R),
            # not green (G), and only network/IO errors are swallowed.
            print(R('[-] Failed download page "%s.pdf"' % (key)))
def filter_key(key):
    """Return *key* with characters that are illegal in Windows file names removed."""
    forbidden = '/\\:*?"<>|'
    return ''.join(ch for ch in key if ch not in forbidden)
def gen_result_txt():
    """Write ``<title> ==> <url>`` lines for every entry of ``url_dict``
    into ``<cwd>/result.txt``, overwriting any previous file."""
    path = os.path.join(os.getcwd(), 'result.txt')
    with open(path, 'w+') as fp:
        for key, value in url_dict.items():
            line = key + ' ==> ' + value + '\n'
            fp.write(line)
    # Bug fix: corrected "reulst" typo in the status message.
    print(G('[+] Successfully gen result text in: %s' % (path)))
if __name__ == '__main__':
    # Never spawn more workers than there are titles to search.
    thread_num = min(thread_num, len(key_list))
    each_len = len(key_list) // thread_num
    # Partition key_list into thread_num contiguous slices; the last worker
    # absorbs the remainder when the division is not exact.
    search_list = []
    for i in range(thread_num):
        if i == thread_num - 1:
            search_list.append(key_list[i*each_len:])
        else:
            search_list.append(key_list[i * each_len: i * each_len + each_len])
    print (search_list)
    pool = Pool(processes=thread_num)
    pool.map(search_page, search_list)
    # Bug fix: Pool.join() raises "ValueError: Pool is still running" unless
    # close() (or terminate()) is called first.
    pool.close()
    pool.join()
    gen_result_txt()
    print (url_dict)
    pool = Pool(processes=thread_num)
    pool.map(download_page, search_list)
    # Bug fix: the second pool was never closed/joined, leaking worker
    # processes at interpreter exit.
    pool.close()
    pool.join()
# --- Web-scrape residue (blog footer links), preserved as comments so the
# --- file remains valid Python; not part of the script itself.
# 【推荐】国内首个AI IDE,深度理解中文开发场景,立即下载体验Trae
# 【推荐】编程新体验,更懂你的AI,立即体验豆包MarsCode编程助手
# 【推荐】抖音旗下AI助手豆包,你的智能百科全书,全免费不限次数
# 【推荐】轻量又高性能的 SSH 工具 IShell:AI 加持,快人一步
# · 物流快递公司核心技术能力-地址解析分单基础技术分享
# · .NET 10首个预览版发布:重大改进与新特性概览!
# · AI与.NET技术实操系列(二):开始使用ML.NET
# · 单线程的Redis速度为什么快?
# · Pantheons:用 TypeScript 打造主流大模型对话的一站式集成库