python3 获取博彩网站页面下所有域名(批量)

已有的域名信息

详细实现过程如下

#!/usr/bin/env python 
# -*- coding:utf-8 -*-
import requests
from bs4 import BeautifulSoup as Bs4
from urllib.parse import urlparse

# HTTP request headers shared by every fetch in get_url().
# A desktop Chrome User-Agent is sent so the gambling sites serve their
# normal page instead of rejecting an obviously scripted client.
headers= {
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.121 Safari/537.36"
}

#打开域名文件1.txt
def new_url():
    url_list  = []
    bo = open("1.txt","r")
    for i in bo:
        url_list.append(i.replace("\n",""))
    return(url_list)


#数据处理
def get_url():
    head_url = new_url()
    num = 0
    for i in head_url: #按行遍历数据
        num = num +1
        print("***********************************"+ i +"***********************************")
        # head_url = "https://www.tkcp.hk/"
        try:
            response = requests.get(url="http://"+i,headers=headers)
            response.encoding = 'gb2312'
            soup = Bs4(response.text,"lxml")
            # print(soup)
            htmls = soup.find_all("a") #获取页面中的所有a标签
            # print(htmls)
            urls = []
            new_urls = []
            for html in htmls:
                url = html.get("href") #获取页面中所有含"href"的字符串
                urls.append(url.replace('\n',''))
                qc_urls = set(urls)
            for url in qc_urls: #处理数据,得到域名地址
                if "http" in url:
                    res = urlparse(url)
                    # print("返回对象:", res)
                    # print("域名", res.netloc)
                    domain = res.netloc
                    new_urls.append(domain)
            qc_new_urls = set(set(new_urls))
            #print("***********************************"+num+"***********************************")
            print(set(qc_new_urls)) #去重
            for j in set(qc_new_urls):
                # print(j)
                with open("url_v1.txt","a+",encoding="utf-8") as f:
                    f.write(j+"\n")
        except Exception as e:
            print("链接无法访问")
    result_list = []
    result = open("./url_v1.txt","r")  
    for r in result.readlines(): 
        result_list.append(r.replace("\n",""))
    for x in set(result_list): #二次数据处理,去掉重复数据
        with open("url_end_V.txt","a+",encoding="utf-8") as f:
            print(x)
            f.write(x+"\n")

# Script entry point: start the batch crawl when executed directly.
if __name__ == "__main__":
    get_url()
posted @ 2019-11-06 16:22  g7y12  阅读(556)  评论(0编辑  收藏  举报