开启多线程-爬取柯林斯词典
import logging
import random
import re
import threading
from concurrent.futures import ThreadPoolExecutor
from multiprocessing import cpu_count

import requests
from bs4 import BeautifulSoup

# ------------------------- Build an English word list -------------------------

logger = logging.getLogger(__name__)

# Entries containing any of these characters (hyphen, ampersand, parentheses,
# slash, dot) are skipped by ``worker``.
rex = re.compile(r'[-&()/\.]+')

# Seconds to wait for a single HTTP response; without a timeout a stalled
# connection would block its pool thread forever.
_REQUEST_TIMEOUT = 30

# The output file is shared by every pool thread and text file objects are not
# thread-safe, so writes are serialised through this lock.
_write_lock = threading.Lock()


def bar(url):
    """Fetch *url* and return the ``<a>`` tags found in its browse list.

    The browse list is the ``<ul>`` element whose class attribute is
    ``columns2 browse-list``.

    Args:
        url: Address of the page to download.

    Returns:
        The ``<a>`` tags inside the browse list, or an empty list when the
        page contains no such ``<ul>``.

    Raises:
        requests.RequestException: If the download fails or times out.
    """
    response = requests.get(url=url, timeout=_REQUEST_TIMEOUT)
    soup = BeautifulSoup(response.text, 'html.parser')
    # ``attrs`` must be a dict mapping attribute name to value.
    ul_obj = soup.find(name='ul', attrs={'class': 'columns2 browse-list'})
    if ul_obj is None:
        logger.warning('no browse list found at %s', url)
        return []
    return ul_obj.find_all(name='a')


def worker(url):
    """Crawl all entries for one starting letter and append them to a file.

    The letter's index page, e.g.
    https://www.collinsdictionary.com/browse/english/words-starting-with-a,
    links to sub-pages; every link text found on those sub-pages is treated
    as a candidate entry. Candidates longer than two characters that contain
    none of the characters matched by ``rex`` are printed and written to the
    file, one per line.

    Args:
        url: A pair ``(letter, file)`` where ``letter`` is the starting letter
            and ``file`` is a writable text file object.

    Raises:
        requests.RequestException: If the letter's index page cannot be
            downloaded. Failures on individual sub-pages are logged and
            skipped instead.
    """
    letter = url[0]
    out_file = url[1]
    index_url = 'https://www.collinsdictionary.com/browse/english/words-starting-with-{}'.format(letter)
    for item in bar(url=index_url):
        href = item.get('href')
        if not href:
            # An anchor without a target cannot be followed.
            continue
        try:
            entries = bar(href)
        except requests.RequestException as exc:
            # One unreachable sub-page should not discard the rest of the letter.
            logger.warning('skipping %s: %s', href, exc)
            continue
        for entry in entries:
            res = entry.text
            if len(res) > 2 and not rex.search(res):
                print(res)
                with _write_lock:
                    out_file.write('{}\n'.format(res))


def spider_collins():
    """Crawl the Collins browse pages and append the entries to ``w.txt``.

    The crawl is three levels deep: one index page per letter (26 letters,
    ``a`` to ``z``), the sub-pages linked from each index page, and finally
    the individual entries listed on those sub-pages. Each letter is handled
    by ``worker`` on a thread pool of ``cpu_count() * 5`` threads.

    Failures of individual letters are logged; they do not stop the crawl.
    """
    futures = {}
    # The executor context exits first and waits for all workers, so the file
    # is still open while they write; both are released even on error.
    with open('w.txt', 'a', encoding='utf8') as f:
        with ThreadPoolExecutor(cpu_count() * 5) as t:
            for i in range(ord('a'), ord('z') + 1):  # 97 ~ 122
                letter = chr(i)
                futures[letter] = t.submit(worker, (letter, f))

    # Exceptions raised in pool threads are stored on the future; report them
    # here so a failed letter does not go unnoticed.
    for letter, future in futures.items():
        exc = future.exception()
        if exc is not None:
            logger.error('crawling letter %r failed: %r', letter, exc)