Batch-crawling the relevant a tags
Recently I needed to check, on the intranet, whether employees had added the correct hyperlink to each site.
Clicking through the domains one by one is too slow,
and a serial for loop that fetches every page and scrapes its a tags takes too long.
So let's learn some multithreading to speed up the check. [Note to self: the template needs a few tweaks when you reuse it.]
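The domains to check live in a plain text file that the script below reads (its name, doaminlist, is kept verbatim from the source). One domain per line, without the scheme, since the script prepends https:// itself; the entries here are made-up placeholders:

www.example-a.com
www.example-b.com
portal.example-c.com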
#!/usr/bin/python3
# -*- coding: utf-8 -*-
# Copyright (C) 2023 - 2023 zhoujt, Inc. All Rights Reserved
# @Date    : 2023/5/25 13:51
# @Author  : zhoujt
# @Email   : xxxx@126.com
# @FileName: domain_test.py
# @IDE     : PyCharm
import threading

import requests
from bs4 import BeautifulSoup


def get_beian(url):
    url = "https://" + url
    try:
        # Send a GET request with a 1 s timeout; an unreachable host
        # raises and is reported below instead of hanging the thread.
        response = requests.get(url, timeout=1)
        # Parse the HTML with BeautifulSoup and walk every <a> tag
        soup = BeautifulSoup(response.content, 'html.parser')
        for link in soup.find_all('a'):
            href = link.get('href')
            # Check whether the link points at the expected target
            if href in ("https://beian.11c.cn", "https://beian.11c.cn/"):
                print("OK:", url, link.text)
            elif href in ("http://beian.11c.cn", "http://beian.11c.cn/"):
                print("missing the s:", url, link.text)
    except Exception as e:
        print("E (unreachable):", url, e)


if __name__ == '__main__':
    # Read the list of domains to check, one per line
    with open("./doaminlist", "r", encoding="utf-8") as fcheck:
        urls = [line.strip() for line in fcheck if line.strip()]

    # Spawn one thread per domain. Pass the function itself as the
    # target -- calling get_beian(url) here would run it serially in
    # the main thread and hand Thread its return value (None).
    threads = []
    for thread_url in urls:
        t = threading.Thread(target=get_beian, args=(thread_url,))
        t.start()
        threads.append(t)

    # Join only after all threads have started, so they actually run
    # in parallel instead of one at a time.
    for t in threads:
        t.join()
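The raw threading.Thread bookkeeping (start everything, then join everything) is easy to get wrong, as the original template shows. The standard library's concurrent.futures does the same job with less code; here is a minimal sketch, assuming the get_beian function and the ./doaminlist file from the script above:

from concurrent.futures import ThreadPoolExecutor

with open("./doaminlist", "r", encoding="utf-8") as f:
    urls = [line.strip() for line in f if line.strip()]

# The pool caps concurrency at 10 workers and joins them all
# automatically when the with-block exits.
with ThreadPoolExecutor(max_workers=10) as pool:
    pool.map(get_beian, urls)

Capping max_workers also keeps the script from opening one socket per domain all at once on a long list, which the one-thread-per-domain version above would happily do.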