# Domain filtering
import requests
import re
import multiprocessing


class Get_url(object):
    """Probe a list of local domains over HTTPS and save the reachable ones.

    Producer/consumer pair communicating through a multiprocessing queue:
    `url_list` reads the domain file and enqueues the domains, and
    `get_url_list` requests each one and appends responders to a file.
    """

    def __init__(self):
        # Desktop Chrome UA so target servers treat us as a normal browser.
        self.headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3724.8 Safari/537.36",
        }

    # Read the local domain list: includes entries starting with m. and www.
    def url_list(self, q):
        """Producer: enqueue the domain list once, then a None sentinel.

        BUG FIX: the original did `q.put(data)` inside `for da in data`,
        enqueueing the ENTIRE list once per line (N copies of all work).
        Lines are also stripped so "https://" + domain is a valid URL
        (readlines() keeps the trailing newline).
        """
        with open('../domain_one/jg.txt', 'r', encoding='utf-8') as f:
            domains = [line.strip() for line in f if line.strip()]
        q.put(domains)
        q.put(None)  # sentinel: tells the consumer no more work is coming

    """
    1. Read the local domains
    2. Visit each one in turn
    3. Keep the domains that respond successfully (m.* / www.* entries)
    4. Save them to a local file
    """

    def get_url_list(self, q):
        """Consumer: fetch each queued domain over HTTPS and record the
        ones that answer successfully in ../domain_two/domain.txt.

        Terminates on the None sentinel. (The original used `q.empty()`,
        which races a slow producer: the consumer could break before any
        work arrived, or block forever in `q.get()`.)
        """
        while True:
            batch = q.get()
            if batch is None:  # producer finished
                break
            for domain in batch:
                url = "https://" + domain
                try:
                    # verify=False skips TLS validation (these are arbitrary
                    # scanned hosts); a 2s timeout keeps the scan moving.
                    response = requests.get(url=url, headers=self.headers,
                                            verify=False, timeout=2)
                    # Raises HTTPError on 4xx/5xx, so anything past this
                    # line responded successfully (the original's extra
                    # `status_code == 200` check was dead code).
                    response.raise_for_status()
                    print(url)
                    # Store the bare domain (no scheme), one per line.
                    with open('../domain_two/domain.txt', 'a+', encoding='utf-8') as f:
                        f.write(domain + '\n')
                except requests.ConnectTimeout:
                    print('超时!')
                except requests.HTTPError:
                    print('http状态码非200')
                except Exception as e:
                    print('未进行容错处理的情况:', e)

    def main(self):
        """Start one producer and one consumer process and wait for both.

        Joining is a fix: the original returned immediately, so a caller
        looping over main() stacked up live processes.
        """
        q = multiprocessing.Queue()
        p1 = multiprocessing.Process(target=self.url_list, args=(q,))
        p2 = multiprocessing.Process(target=self.get_url_list, args=(q,))
        p1.start()
        p2.start()
        p1.join()
        p2.join()


# Invoke the methods
# Entry point: run ten producer/consumer scan rounds.
if __name__ == '__main__':
    runner = Get_url()
    for _ in range(10):
        runner.main()
# posted on 2019-08-28 19:00  Yihan_07  阅读(392)  评论(0)  — blog-scrape footer, not code