柯林斯
原文地址:https://www.cnblogs.com/dylan9/p/9207366.html
python代码:
# Thread-pool crawler that collects the Collins dictionary word index.
# File name: sample.py
import string
import threading
import time

import requests
from lxml import etree
from multiprocessing.dummy import Pool

headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.163 Safari/537.36"
}

# Seconds to wait for a response before giving up on a page; without a
# timeout a single stalled connection would block a pool thread forever.
REQUEST_TIMEOUT = 30

# multiprocessing.dummy.Pool exposes the multiprocessing API on top of threads.
pool = Pool(20)

# One browse page per starting letter, a through z.
# The same list can be scraped from the index page
# https://www.collinsdictionary.com/zh/browse/english/ with the XPath
# //ul[@class='bLtr']/li/a/@href (dropping the first entry).
li_list = [
    "https://www.collinsdictionary.com/zh/browse/english/words-starting-with-{}".format(letter)
    for letter in string.ascii_lowercase
]

# Second-level page URLs, filled in by get_urls() from the pool threads.
deep_url_list = []

# Serialises appends to word2.txt so that lines written by different
# threads cannot interleave.
_write_lock = threading.Lock()

start = time.time()


def _fetch_tree(url):
    """Download *url* and return its parsed lxml tree, or None on failure.

    Network errors are reported on stdout instead of being raised, so one
    bad page does not abort the whole crawl.
    """
    try:
        response = requests.get(url=url, headers=headers, timeout=REQUEST_TIMEOUT)
    except requests.RequestException as exc:
        print("request failed:", url, exc)
        return None
    page_text = response.text
    if not page_text.strip():
        print("empty response:", url)
        return None
    return etree.HTML(page_text)


def get_urls(url):
    """Collect the second-level links found on a letter page.

    Every href inside the ``columns2 bL`` list is appended to the
    module-level ``deep_url_list``. Returns None.
    """
    tree = _fetch_tree(url)
    if tree is None:
        return
    deep_url_list.extend(tree.xpath("//ul[@class='columns2 bL']/li/a/@href"))


def get_data(url):
    """Append the words listed on a second-level page to ``word2.txt``.

    The first text node of the link in each ``<li>`` of the ``columns2 bL``
    list is written on its own line. Items without link text are skipped.
    Returns None.
    """
    tree = _fetch_tree(url)
    if tree is None:
        return
    words = []
    for li in tree.xpath("//ul[@class='columns2 bL']/li"):
        texts = li.xpath('./a/text()')
        if texts:  # an <li> without link text would otherwise raise IndexError
            words.append(texts[0])
    # Write the whole page in one locked section rather than word by word.
    with _write_lock:
        with open("word2.txt", "a", encoding="utf-8") as f:
            f.writelines(word + '\n' for word in words)


# Stage 1: letter pages -> second-level URLs (blocks until all are done).
pool.map(get_urls, li_list)
# Stage 2: second-level pages -> words appended to word2.txt.
result = pool.map_async(get_data, deep_url_list)
result.wait()
# Release the worker threads now that both stages have finished.
pool.close()
pool.join()
print("执行完毕")
print("耗时:", time.time()-start)
Windows 下安装 Python、pip 和 requests 包,结果没有用
import re
import random
import threading
import requests

from bs4 import BeautifulSoup
from concurrent.futures import ThreadPoolExecutor
from multiprocessing import cpu_count

# ------------------------- Build an English word list -------------------------

# Entries containing any of these punctuation characters are skipped.
rex = re.compile(r'[-&()/\.]+')

# Seconds to wait for a response; without a timeout one stalled connection
# would block its worker thread forever.
REQUEST_TIMEOUT = 30

# Serialises writes to the output file, which all worker threads share.
_write_lock = threading.Lock()


def bar(url):
    """Return the ``<a>`` tags inside the ``columns2 browse-list`` ``<ul>`` of *url*.

    Returns an empty list when the request fails or the page has no such
    list, so callers can iterate over the result unconditionally.
    """
    try:
        response = requests.get(url=url, timeout=REQUEST_TIMEOUT)
    except requests.RequestException as exc:
        print('request failed: {} ({})'.format(url, exc))
        return []
    soup = BeautifulSoup(response.text, 'html.parser')
    # attrs must be a dict; the previous set literal {'class', '...'} only
    # matched by accident.
    ul_obj = soup.find(name='ul', attrs={'class': 'columns2 browse-list'})
    if ul_obj is None:
        return []
    return ul_obj.find_all(name='a')


def worker(url):
    """Crawl every word under one starting letter and write it to a file.

    *url* is a ``(letter, file)`` pair: the letter selects the page
    https://www.collinsdictionary.com/browse/english/words-starting-with-<letter>
    and the file is an open, writable text file. Each link on that page is
    followed, and every entry found there is printed and written on its own
    line, provided it is longer than two characters and contains none of
    the punctuation matched by ``rex``.
    """
    letter, out_file = url
    a_list = bar(url='https://www.collinsdictionary.com/browse/english/words-starting-with-{}'.format(letter))
    for item in a_list:
        href = item.get('href')
        if not href:  # anchor without a target: nothing to follow
            continue
        for i in bar(href):
            res = i.text
            if not rex.search(res) and len(res) > 2:
                print(res)
                with _write_lock:
                    out_file.write('{}\n'.format(res))


def spider_collins():
    """Crawl all words from the Collins site and append them to ``w.txt``.

    The links are three levels deep: the first level is one page per
    letter (26 in total), the second level lists the groups of words or
    phrases starting with that letter, and the third level holds the
    individual words. One task per letter is submitted to a thread pool.
    """
    with open('w.txt', 'a', encoding='utf8') as f:
        # Leaving the executor's with-block waits for every task to finish,
        # so the file is still open while the workers write to it.
        with ThreadPoolExecutor(cpu_count() * 5) as t:
            futures = [
                t.submit(worker, (chr(i), f))
                for i in range(ord('a'), ord('z') + 1)  # 97 ~ 122
            ]
        # Surface worker failures instead of letting them vanish with the
        # discarded futures.
        for future in futures:
            exc = future.exception()
            if exc is not None:
                print('worker failed: {!r}'.format(exc))
差不多一个意思吧,还要消化下
pip install requests
等待系统自动加载安装。
人就像是被蒙着眼推磨的驴子,生活就像一条鞭子;当鞭子抽到你背上时,你就只能一直往前走,虽然连你也不知道要走到什么时候为止,便一直这么坚持着。