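# [2022-03-20] A script written for my girlfriend that uses a search engine to crawl conference papers. Pages are parsed with BeautifulSoup. This was my first time writing a crawler, so I am not very good at it yet...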
import bs4, requests, os
from multiprocessing import Manager, Pool
# Red: used for error messages.
def R(message):
    """Return ``message`` wrapped in the ANSI bold bright-red escape codes.

    ``message`` is rendered with its default formatting, so non-string
    values are accepted as well.
    """
    red_on = "\033[1;91m"
    reset = "\033[0;m"
    return red_on + format(message) + reset
# Green: used for success messages.
def G(message):
    """Return ``message`` wrapped in the ANSI bold bright-green escape codes.

    ``message`` is rendered with its default formatting, so non-string
    values are accepted as well.
    """
    pieces = ("\033[1;92m", format(message), "\033[0;m")
    return "".join(pieces)
# Blue: used for progress banners.
def B(message):
    """Return ``message`` wrapped in the ANSI bold bright-blue escape codes.

    ``message`` is rendered with its default formatting, so non-string
    values are accepted as well.
    """
    text = format(message)
    return "\033[1;94m" + text + "\033[0;m"
# Shared mapping "paper title -> PDF link", backed by a multiprocessing
# manager so the pool workers can fill it in.
# NOTE(review): the manager is started at import time, so every process that
# imports this module executes this line too -- confirm this is acceptable
# for the process start method in use.
url_dict = Manager().dict()

# Paper titles used as search keywords.
key_list = [
    "On the TOCTOU Problem in Remote Attestation",
    "Search-based Approaches for Local Black-Box Code Deobfuscation: Understand, Improve and Mitigate",
    "Exorcising Spectres with Secure Compilers",
]

# Browser-like User-Agent header sent with every HTTP request.
fakeua = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/68.0.3440.84 Safari/537.36"
    ),
}

# Inspect at most the first 10 result links of each search.
search_depth = 10

# Upper bound on the number of pool workers (these are processes, not threads).
thread_num = 8
def search_page(search_list):
    """Search Bing for each paper title and record the first PDF-looking link.

    For every title in ``search_list`` the Bing result page is fetched and the
    first ``search_depth`` elements matching the ``.sh_favicon`` CSS selector
    are inspected. The first ``href`` that contains the substring ``"pdf"`` is
    stored in the shared ``url_dict`` under the title. The outcome for each
    title is printed in green (found) or red (not found / request failed).

    A failed request for one title is reported and skipped, so the remaining
    titles in the batch are still searched.

    Args:
        search_list: iterable of paper titles used as search keywords.
    """
    print(B('[*]Starting search page now......'))
    for keywords in search_list:
        # Run the Bing search and download the result page. The title is
        # passed through ``params`` so that characters such as '&', '#' or
        # '+' are percent-encoded instead of corrupting the query string.
        try:
            searchPage = requests.get(
                'https://cn.bing.com/search',
                params={'q': keywords},
                headers=fakeua,
                timeout=30,  # never hang a worker on a stalled connection
            )
            searchPage.raise_for_status()
        except requests.RequestException as err:
            print(R('[-] Search request failed for paper "%s": %s' % (keywords, err)))
            continue

        # Collect the candidate link elements from the result page.
        searchSoup = bs4.BeautifulSoup(searchPage.text, features="html.parser")
        elements = searchSoup.select('.sh_favicon')

        # Look through the first ``search_depth`` candidates for a PDF link.
        pdf_href = None
        for element in elements[:search_depth]:
            href = element.get('href')
            # ``get`` returns None when the element has no href attribute.
            if href and "pdf" in href:
                pdf_href = href
                break

        if pdf_href is not None:
            url_dict[keywords] = pdf_href
            print(G('[+] Get download href "%s" for paper "%s"' % (pdf_href, keywords)))
        else:
            print(R('[-] Cannot get download href for paper "%s"' % (keywords)))
def download_page(search_list):
    """Download the PDF recorded for each title into the ``result`` directory.

    For every title in ``search_list`` that has an entry in the shared
    ``url_dict``, the linked document is fetched and written to
    ``<cwd>/result/<sanitised title>.pdf``. Titles without a recorded link are
    skipped. Each outcome is printed in green (success) or red (failure); a
    failure for one title does not stop the remaining downloads.

    Args:
        search_list: iterable of paper titles (keys of ``url_dict``).
    """
    print(B('[*]Starting download page now......'))
    # The component must be relative: a leading separator makes os.path.join
    # treat it as rooted and drop the working directory on Windows.
    result_dir = os.path.join(os.getcwd(), 'result')
    for key in search_list:
        url = url_dict.get(key)
        if url is None:
            # No link was found for this title; keep going with the others.
            continue
        file_stem = filter_key(key)
        try:
            with requests.get(url, headers=fakeua, stream=True, timeout=30) as data:
                # Do not save an HTTP error page under a .pdf name.
                data.raise_for_status()
                content = data.content
            # Several worker processes may get here at once; exist_ok avoids
            # the check-then-create race of exists() followed by mkdir().
            os.makedirs(result_dir, exist_ok=True)
            page_path = os.path.join(result_dir, '%s.pdf' % file_stem)
            with open(page_path, 'wb') as fp:
                fp.write(content)
        except (requests.RequestException, OSError) as err:
            print(R('[-] Failed download page "%s.pdf": %s' % (file_stem, err)))
        else:
            print(G('[+] Successfully download page "%s.pdf"' % (file_stem)))
# Strip the special characters from a name so it can be used as a file name.
def filter_key(key):
    """Return ``key`` with every forbidden file-name character removed.

    The removed characters are: slash, backslash, colon, asterisk, question
    mark, double quote, angle brackets and the vertical bar.
    """
    forbidden = '/\\:*?"<>|'
    kept = [ch for ch in key if ch not in forbidden]
    return ''.join(kept)
def gen_result_txt():
    """Write every ``title ==> link`` pair of ``url_dict`` to ``result.txt``.

    The file is created (or overwritten) in the current working directory,
    one pair per line, and its path is printed afterwards.
    """
    path = os.path.join(os.getcwd(), 'result.txt')
    # Explicit UTF-8 so titles or links with characters outside the locale's
    # default encoding cannot raise UnicodeEncodeError. Plain 'w' is enough
    # because the file is never read back here.
    with open(path, 'w', encoding='utf-8') as fp:
        fp.writelines(
            key + ' ==> ' + value + '\n' for key, value in url_dict.items()
        )
    print(G('[+] Successfully gen reulst text in: %s' % (path)))
if __name__ == '__main__':
    # Distribute the titles over the worker processes.
    # Never start more workers than there are titles, but keep at least one
    # so an empty key_list does not lead to a zero-sized pool.
    thread_num = max(1, min(thread_num, len(key_list)))
    # Round-robin split: worker i gets titles i, i + thread_num, ... so the
    # batches differ in size by at most one.
    search_list = [key_list[i::thread_num] for i in range(thread_num)]
    print (search_list)
    # Pool.map blocks until every batch has been processed, so no explicit
    # join is needed (join without a prior close raises ValueError). The
    # with-block shuts the pool down once both phases are finished.
    with Pool(processes=thread_num) as pool:
        # Phase 1: look up a PDF link for every title.
        pool.map(search_page, search_list)
        gen_result_txt()
        print (url_dict)
        # Phase 2: download the documents that were found.
        pool.map(download_page, search_list)