Batch-crawling the relevant a tags

Recently I needed to check whether employees on our intranet had correctly added the required hyperlink.

Opening every domain by hand to check it was far too slow.

A plain for loop scraping the a tags was still too tedious.

So I picked up some multithreading to speed the checks up [note to self: the template needs a few tweaks when you reuse it].

#!/usr/bin/python3
# -*- coding: utf-8 -*-
# Copyright (C) 2023 - 2023 zhoujt, Inc. All Rights Reserved 
# @Date    : 2023/5/25 13:51
# @Author  : zhoujt
# @Email   : xxxx@126.com
# @FileName: domain_test.py
# @IDE     : PyCharm
import threading

import requests
from bs4 import BeautifulSoup

def get_beian(url):
    url = "https://" + url
    try:
        # Send a GET request with a 1 s timeout; unreachable hosts raise
        # an exception and are skipped below
        response = requests.get(url, timeout=1)
        # Parse the HTML with BeautifulSoup
        soup = BeautifulSoup(response.content, 'html.parser')
        # Walk every <a> tag and inspect its href
        for link in soup.find_all('a'):
            href = link.get('href')
            if href in ("https://beian.11c.cn", "https://beian.11c.cn/"):
                print("OK: ", link.text)  # print the link text
            elif href in ("http://beian.11c.cn", "http://beian.11c.cn/"):
                # http instead of https: the link is missing an "s"
                print("link missing an s: ", link.text)
    except Exception:
        # Site down, timed out, or unparsable: skip it
        pass


if __name__ == '__main__':
    # Read the list of domains to check, one domain per line
    with open("./doaminlist", "r", encoding="utf-8") as fcheck:
        urls = [line.strip() for line in fcheck if line.strip()]

    # Start one thread per domain first, then join them all, so the
    # requests actually run in parallel instead of one after another
    threads = []
    for thread_url in urls:
        # Pass the function and its argument separately; calling
        # get_beian(thread_url) here would run it in the main thread
        title_thread = threading.Thread(target=get_beian, args=(thread_url,))
        title_thread.start()
        threads.append(title_thread)
    for title_thread in threads:
        title_thread.join()
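
Starting one thread per domain works for a short list, but for larger lists a thread pool bounds the concurrency and needs less bookkeeping. Here is a minimal sketch using the standard library's concurrent.futures; it assumes the get_beian() function from the script above, and the pool size of 10 is an arbitrary choice of mine, not something from the original script:

#!/usr/bin/python3
# -*- coding: utf-8 -*-
# Sketch: the same check driven by a thread pool instead of hand-rolled threads.
# Assumes get_beian() from the script above; max_workers=10 is an assumption.
from concurrent.futures import ThreadPoolExecutor


def check_all(path="./doaminlist"):
    # Read the domain list, one domain per line
    with open(path, "r", encoding="utf-8") as f:
        urls = [line.strip() for line in f if line.strip()]
    # The pool runs up to 10 checks at once; leaving the `with` block
    # waits for every submitted task to finish
    with ThreadPoolExecutor(max_workers=10) as pool:
        pool.map(get_beian, urls)


if __name__ == '__main__':
    check_all()

The pool also avoids the cost of creating one OS thread per domain, which matters once the list grows into the hundreds.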

 
