import re
import random
import requests
from bs4 import BeautifulSoup
from concurrent.futures import ThreadPoolExecutor
from multiprocessing import cpu_count
# ------------------------- Build the English dictionary --------------------------------------
# Matches a run of one or more of the characters - & ( ) / . anywhere in a string.
# worker() uses it to reject scraped entries that contain any of these characters.
rex = re.compile(r'[-&()/\.]+')
def bar(url, timeout=30):
    """Download a page and return the anchors inside its browse list.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before ``requests`` gives up.
            Without a timeout a stalled connection would block the calling
            thread indefinitely.

    Returns:
        The ``<a>`` tags found inside the first
        ``<ul class="columns2 browse-list">`` element, or an empty list when
        the page contains no such element.
    """
    response = requests.get(url=url, timeout=timeout)
    soup = BeautifulSoup(response.text, 'html.parser')
    # attrs has to be a dict mapping attribute name -> value. The previous
    # ``{'class', '...'}`` was a set literal (comma instead of colon).
    ul_obj = soup.find(name='ul', attrs={'class': 'columns2 browse-list'})
    if ul_obj is None:
        # Page without the expected list (error page, layout change, ...):
        # give callers an empty iterable instead of raising AttributeError.
        return []
    return ul_obj.find_all(name='a')
def worker(url):
    """
    Crawl every entry listed under one starting letter and record the words.

    Despite its name, ``url`` is a two-item sequence: ``url[0]`` is the
    letter and ``url[1]`` is a writable text file object. The letter is
    substituted into
    https://www.collinsdictionary.com/browse/english/words-starting-with-{letter}
    (for example, the "-with-a" page indexes all entries that begin with "a").
    Every link on that index page is followed, and each entry found there is
    printed and written to the file, one per line.
    """
    index_url = 'https://www.collinsdictionary.com/browse/english/words-starting-with-{}'.format(url[0])
    for group_link in bar(url=index_url):
        for entry_link in bar(group_link.get('href')):
            word = entry_link.text
            # Keep only entries longer than two characters that contain none
            # of the characters matched by ``rex``.
            if len(word) <= 2 or rex.search(word):
                continue
            print(word)
            url[1].write('{}\n'.format(word))
def spider_collins():
    """
    Crawl the Collins dictionary word index and append every word to ``w.txt``.

    The crawl is three levels deep:
      1. one index page per letter, a-z (26 in total);
      2. the word/phrase group pages linked from each letter page;
      3. the individual entries listed on each group page.

    One ``worker`` task is submitted per letter to a thread pool, and all
    tasks share a single output file opened in append mode.

    NOTE: the futures returned by ``submit`` are not inspected, so an
    exception raised inside a worker is not re-raised here.
    """
    # The file is the outer context so that it is closed last: leaving the
    # executor block waits for every submitted worker to finish first, and
    # both resources are released even if submitting a task raises.
    with open('w.txt', 'a', encoding='utf8') as f:
        with ThreadPoolExecutor(cpu_count() * 5) as t:
            for i in range(ord('a'), ord('z') + 1):  # 97 ~ 122
                t.submit(worker, (chr(i), f))