Web Security Study Notes: A ChromeDriver-Based Tool for Precise Collection and Use of Site Information

0x00 Repost

This article is reposted from the ShenNong Sec (神农sec) WeChat public account.

 

0x01 Development Approach

  • Use chromedriver to simulate normal user browsing behavior

  • Extract the required fields from the page with XPath

  • Some data may be missed during crawling, so any domain that returns no data is queried a second time

  • Use tqdm to show a progress bar and improve the user experience

  • Use multithreading to speed up the crawl

  • Reduce each entry to its main (registrable) domain, drop IP addresses, and deduplicate before crawling (see the sketch below)
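
A minimal sketch of that normalization step, assuming tldextract is installed; the helper name normalize_targets and the sample inputs are illustrative, not part of the tool itself:

import re
import tldextract

# Reduce each entry to its registrable (main) domain, drop anything that looks
# like a bare IPv4 address, then deduplicate.
IPV4_RE = re.compile(r'^(?:\d{1,3}\.){3}\d{1,3}$')

def normalize_targets(hosts):
    cleaned = set()
    for host in hosts:
        host = host.strip()
        if not host or IPV4_RE.match(host):
            continue                      # skip blank lines and IPv4 addresses
        parts = tldextract.extract(host)  # splits into subdomain / domain / suffix
        if parts.domain and parts.suffix:
            cleaned.add(f"{parts.domain}.{parts.suffix}")
    return sorted(cleaned)

print(normalize_targets(["www.example.com", "blog.example.com", "192.168.1.1"]))
# -> ['example.com']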

 

0x02 Design

  • Based on the approach above, the tool needs the following functions:

  • The user supplies a .txt file listing the target domains; read it into a list

  • Clean the list (extract main domains, remove IP addresses, deduplicate)

  • Create a thread pool for crawling (each worker thread calls the query function below)

  • Implement the domain lookup with chromedriver; there are two query functions, and if the first one fails the second is called as a retry

  • Finally, write the results to a .csv file (a thread-pool and progress-bar sketch follows this list)
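
A minimal sketch of the thread-pool-plus-progress-bar pattern, with a placeholder fake_query standing in for the real chromedriver lookup:

import concurrent.futures
from tqdm import tqdm

def fake_query(domain):
    # Placeholder for the real chromedriver-based lookup
    return f"{domain},queried"

def run_pool(domains, workers=10):
    results = []
    with concurrent.futures.ThreadPoolExecutor(max_workers=workers) as executor:
        futures = [executor.submit(fake_query, d) for d in domains]
        # as_completed yields futures as they finish, so the bar tracks real progress
        with tqdm(total=len(domains), desc="Querying") as bar:
            for future in concurrent.futures.as_completed(futures):
                results.append(future.result())
                bar.update(1)
    return results

print(run_pool(["example.com", "example.org"]))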

 

0x03 Implementation

import argparse
import re
import sys
from datetime import datetime

import concurrent.futures
import tldextract
from lxml import etree
from rich import print as rprint
from tqdm import tqdm

from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
# Configure ChromeOptions so Chrome runs headlessly in the background
chrome_options = webdriver.ChromeOptions()
chrome_options.add_argument('--headless')    # do not open a browser window
chrome_options.add_argument('--no-sandbox')  # disable the sandbox when running on Linux
chrome_options.add_argument('--log-level=3')
chrome_options.add_experimental_option('excludeSwitches', ['enable-logging'])
# Formatted console output
def error(date, body):
    rprint("[[bold green]" + date + "[/bold green]] [[bold red]Error[/bold red]] > " + body)
def info(date, body):
    rprint("[[bold green]" + date + "[/bold green]] [[bold blue]Info[/bold blue]] > " + body)
def prompt(date, body):
    rprint("[[bold green]" + date + "[/bold green]]: [[bold blue]" + body + "[/bold blue]]")
def file_Read(path):
    try:
        datas = []
        with open(path, 'r', encoding='utf-8') as file:
            while True:
                data = file.readline()
                if not data:  # an empty string means the whole file has been read
                    break
                data = data.strip()
                if data:  # skip blank lines
                    datas.append(data)
            return datas
    except Exception as e:
        error(datetime.now().strftime("%Y-%m-%d %H:%M:%S"), 'Failed to read the file, please check the path and file type!')
        info(datetime.now().strftime("%Y-%m-%d %H:%M:%S"), 'Exiting')
        sys.exit()
def data_Processing(datas):
    domains_one = []
    for data in tqdm(datas, desc="Preparing data"):
        try:
            domain_name = tldextract.extract(data)
            domain_name = f"{domain_name.domain}.{domain_name.suffix}"
            domains_one.append(domain_name)
        except Exception as e:
            continue
    ip_pattern = re.compile(
        r'(?:\d{1,3}\.){3}\d{1,3}'  # IPv4
        r'|(?:[A-Fa-f0-9]{1,4}:){7}[A-Fa-f0-9]{1,4}'  # full (uncompressed) IPv6
        r'|(?:[A-Fa-f0-9]{1,4}:){1,7}:?'  # partial match for compressed IPv6
    )
    def contains_ip_address(s):
        return bool(ip_pattern.search(s))
    # keep only entries that do not look like IP addresses
    domains = [item for item in domains_one if not contains_ip_address(item)]
    return domains
def Weight_query_B(domain):
    # Fallback query: domains that still fail here are recorded in err_domains
    driver = None
    try:
        target = 'https://www.aizhan.com/cha/' + domain
        service = Service(r'./chromedriver.exe', service_log_path=logFilename)
        driver = webdriver.Chrome(service=service, options=chrome_options)
        driver.get(target)
        # Wait until the Baidu rank element is rendered before reading the page source
        WebDriverWait(driver, 10).until(EC.presence_of_element_located((By.ID, 'baidurank_br')))
        page_source = driver.page_source
        tree = etree.HTML(page_source)
        baidu = tree.xpath('//*[@id="baidurank_br"]/img/@alt')[0]
        yidong = tree.xpath('//*[@id="baidurank_mbr"]/img/@alt')[0]
        three60 = tree.xpath('//*[@id="360_pr"]/img/@alt')[0]
        SM = tree.xpath('//*[@id="sm_pr"]/img/@alt')[0]
        sogou = tree.xpath('//*[@id="sogou_pr"]/img/@alt')[0]
        google = tree.xpath('//*[@id="google_pr"]/img/@alt')[0]
        ICP_ICP = tree.xpath('//*[@id="icp_icp"]')[0]
        ICP_company = tree.xpath('//*[@id="icp_company"]')[0]
        line_to_write = f"{domain},{ICP_ICP.text},{ICP_company.text},{baidu},{yidong},{three60},{SM},{sogou},{google}"
        # Remove any newlines embedded in the scraped text, then terminate the CSV line
        line_to_write = line_to_write.replace('\n', '') + "\n"
        Query_results(line_to_write)
    except Exception as e:
        err_domains.append(domain)
    finally:
        if driver is not None:
            driver.quit()
def Weight_query(domain):
    # Primary query; on any failure, fall back to Weight_query_B for one retry
    driver = None
    try:
        target = 'https://www.aizhan.com/cha/' + domain
        service = Service(r'./chromedriver.exe', service_log_path=logFilename)
        driver = webdriver.Chrome(service=service, options=chrome_options)
        driver.get(target)
        # Wait until the Baidu rank element is rendered before reading the page source
        WebDriverWait(driver, 10).until(EC.presence_of_element_located((By.ID, 'baidurank_br')))
        page_source = driver.page_source
        tree = etree.HTML(page_source)
        baidu = tree.xpath('//*[@id="baidurank_br"]/img/@alt')[0]
        yidong = tree.xpath('//*[@id="baidurank_mbr"]/img/@alt')[0]
        three60 = tree.xpath('//*[@id="360_pr"]/img/@alt')[0]
        SM = tree.xpath('//*[@id="sm_pr"]/img/@alt')[0]
        sogou = tree.xpath('//*[@id="sogou_pr"]/img/@alt')[0]
        google = tree.xpath('//*[@id="google_pr"]/img/@alt')[0]
        ICP_ICP = tree.xpath('//*[@id="icp_icp"]')[0]
        ICP_company = tree.xpath('//*[@id="icp_company"]')[0]
        line_to_write = f"{domain},{ICP_ICP.text},{ICP_company.text},{baidu},{yidong},{three60},{SM},{sogou},{google}"
        # Remove any newlines embedded in the scraped text, then terminate the CSV line
        line_to_write = line_to_write.replace('\n', '') + "\n"
        Query_results(line_to_write)
    except Exception as e:
        Weight_query_B(domain)
    finally:
        if driver is not None:
            driver.quit()
def Query_results(result):
    # list.append is atomic in CPython, so worker threads can share Weight_data safely
    Weight_data.append(result)
def Multi_threading(domains):
    try:
        with concurrent.futures.ThreadPoolExecutor(max_workers=10) as executor:
            # Show overall progress with tqdm
            with tqdm(total=len(domains), desc="Querying domain weights") as progress_bar:
                # Submit every domain to the pool and collect the Future objects
                futures = [executor.submit(Weight_query, domain) for domain in domains]
                # Advance the progress bar as each task completes
                for future in concurrent.futures.as_completed(futures):
                    progress_bar.update(1)
    except Exception as e:
        pass
def Data_writing(Weight_data):
    try:
        filename = formatted_now + '.csv'
        with open(filename, 'a', encoding='utf-8') as file:
            file.write("domain_name,ICP,ICP_company,Baidu_weight,yidong_weight,360_weight,SM_weight,sogou_weight,google_weight\n")
            for line in tqdm(Weight_data, desc=f"Writing results to: {filename}"):
                file.write(line)
            info(datetime.now().strftime("%Y-%m-%d %H:%M:%S"), f'Done, results written to: {filename}')
    except Exception as e:
        error(datetime.now().strftime("%Y-%m-%d %H:%M:%S"), 'Failed to write the results!')
def logFile():
    # Create the chromedriver log file up front (not called in the current flow)
    try:
        with open(logFilename, 'a', encoding='utf-8') as file:
            file.write("log file")
    except Exception as e:
        sys.exit()
if __name__ == '__main__':
    parser = argparse.ArgumentParser(description="Website weight lookup")
    parser.add_argument('-u', '--url', type=str, help='Query the weight of a single domain')
    parser.add_argument('-f', '--file', type=str, help='Query domain weights in bulk from a file')
    args = parser.parse_args()
    if args.url:
        now = datetime.now()
        formatted_now = now.strftime("%Y%m%d%H%M%S")
        logFilename = formatted_now + '.log'
        err_domains = []  # filled by Weight_query_B when a domain cannot be resolved
        info(datetime.now().strftime("%Y-%m-%d %H:%M:%S"), 'Welcome to the weight query tool! This is version 3.0!')
        print(r'''
  _______     ____          ____  __ 
 / ____\ \   / /\ \        / /  \/  |
| |     \ \_/ /  \ \  /\  / /| \  / |
| |      \   /    \ \/  \/ / | |\/| |
| |____   | |      \  /\  /  | |  | |
 \_____|  |_|       \/  \/   |_|  |_|
''')
        domain_name = args.url
        try:
            domain_name = tldextract.extract(domain_name)
            domain_name = f"{domain_name.domain}.{domain_name.suffix}"
        except Exception as e:
            error(datetime.now().strftime("%Y-%m-%d %H:%M:%S"), 'Failed to process the domain, please check the value passed in!')
        try:
            Weight_data = []
            Weight_query(domain_name)
            Weight_data = Weight_data[0].replace('\n', '')
            Weight_data = Weight_data.split(',')
            print(Weight_data)
            info(datetime.now().strftime("%Y-%m-%d %H:%M:%S"), 'Query result:')
            prompt('Domain', Weight_data[0])
            prompt('ICP', Weight_data[1])
            prompt('Company', Weight_data[2])
            prompt('Baidu weight', Weight_data[3])
            prompt('Mobile weight', Weight_data[4])
            prompt('360 weight', Weight_data[5])
            prompt('Shenma weight', Weight_data[6])
            prompt('Sogou weight', Weight_data[7])
            prompt('Google PR', Weight_data[8])
        except Exception as e:
            error(datetime.now().strftime("%Y-%m-%d %H:%M:%S"), 'Oops! Something went wrong...')
    elif args.file:
        print(r'''
          _______     ____          ____  __ 
         / ____\ \   / /\ \        / /  \/  |
        | |     \ \_/ /  \ \  /\  / /| \  / |
        | |      \   /    \ \/  \/ / | |\/| |
        | |____   | |      \  /\  /  | |  | |
         \_____|  |_|       \/  \/   |_|  |_|
''')
        info(datetime.now().strftime("%Y-%m-%d %H:%M:%S"), 'Welcome to the weight query tool! This is version 3.0!')
        err_domains = []  # filled by Weight_query_B when a domain cannot be resolved
        now = datetime.now()
        formatted_now = now.strftime("%Y%m%d%H%M%S")
        logFilename = formatted_now + '.log'
        Weight_data = []
        datas = file_Read(args.file)
        domains = data_Processing(datas)
        domains = list(set(domains))  # deduplicate
        Multi_threading(domains)
        info(datetime.now().strftime("%Y-%m-%d %H:%M:%S"), 'No results for these domains: ' + str(err_domains))
        Data_writing(Weight_data)
    else:
        print(r'''
  _______     ____          ____  __ 
 / ____\ \   / /\ \        / /  \/  |
| |     \ \_/ /  \ \  /\  / /| \  / |
| |      \   /    \ \/  \/ / | |\/| |
| |____   | |      \  /\  /  | |  | |
 \_____|  |_|       \/  \/   |_|  |_|
''')
        error(datetime.now().strftime("%Y-%m-%d %H:%M:%S"), "That's not how this works! Run with --help to see the options...")
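
Typical usage, as a minimal example: the script filename weight_query.py is just a placeholder, and chromedriver.exe must sit in the same directory, as required by the Service(r'./chromedriver.exe') call above.

python weight_query.py -u www.example.com   # query a single domain and print the result
python weight_query.py -f domains.txt       # bulk query; results go to a timestamped .csv file

The .log file created alongside the .csv captures chromedriver's own output so it does not clutter the console.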

 
