Python: fetch web page content, now with page classification and removal of array elements by suffix (based on Python 3.6)
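In short, the script below crawls the Unisys hurricane index, classifies each link by ocean basin and year, strips the redundant .gif image links, and downloads the remaining data files. The "remove array elements by a given suffix" feature reduces to this pattern (a minimal standalone sketch, separate from the script below):

    links = ['atlantic.json', 'track.gif', 'pacific.json']
    links = [x for x in links if not x.lower().endswith('.gif')]  # drops 'track.gif'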

from urllib.parse import urljoin
import urllib.request

from bs4 import BeautifulSoup
import time
import os
import re
import errno


def mkdir_p(path):  # recursively create nested directories
    try:
        os.makedirs(path)
    except OSError as exc:  # Python >2.5 syntax ("except OSError, exc:" on Python <2.5)
        if exc.errno == errno.EEXIST and os.path.isdir(path):
            pass
        else:
            raise
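# Note: on Python 3.2+ the whole try/except above collapses into a single call
# with the same "ignore an existing directory" behavior:
#     os.makedirs(path, exist_ok=True)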


def get_link(page):  # collect the href of every link inside <td> cells
    linkData = []
    for td in page.find_all('td'):
        links = td.select("a")
        for each in links:
            # if str(each.get('href'))[:1] == '/':  optional filter, disabled
            data = each.get('href')
            linkData.append(data)
    return linkData
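# Note: the nested loops above are equivalent to a single CSS selector; a
# minimal alternative body for get_link would be:
#     return [a.get('href') for a in page.select("td a")]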


def gain(url):  # fetch a page and return the links it contains
    try:
        page = urllib.request.urlopen(url).read()
        soup = BeautifulSoup(page, 'lxml')  # parse the page with BeautifulSoup
        links = get_link(soup)  # collect the <a href=...> targets
        return links
    except Exception:
        print('Failed to fetch link: ' + url)
        return []  # empty list, so callers can still iterate over the result
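# Note: urlopen can block for a long time on a dead link; urllib.request.urlopen
# accepts a timeout (in seconds), which keeps the crawl from stalling:
#     urllib.request.urlopen(url, timeout=30)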


def main():
    url = 'http://weather.unisys.com/hurricane/index.php'
    Download_dir = 'E:\\Typhoon_data\\Data'  # download path
    Web_Link = gain(url)

    for Link in range(len(Web_Link)):
        Link_Add = Web_Link[Link]
        Link_One = re.split("/", Link_Add)  # split on '/' into a list
        Ocean_Folder = Link_One[0]  # first element: the ocean basin folder
        Ocean_Time = Link_One[1]  # second element: the year (or index.php)
        url_Typhoon = 'http://weather.unisys.com/hurricane/'
        _connet = urljoin(url_Typhoon, Link_Add)
        Web_Link_ = gain(_connet)

        # drop the redundant .gif links
        Gifdata = []
        for Gif in range(len(Web_Link_)):
            Gifdata_ = Web_Link_[Gif]
            findGif = re.findall(r'\.gif$', Gifdata_, re.I)  # escape the dot to match a literal '.'
            if findGif:
                Gifdata.append(Gifdata_)
                # print(Gifdata)
            else:
                continue
        for _Gif in range(len(Gifdata)):
            Web_Link_.remove(Gifdata[_Gif])
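        # Note: the append-then-remove pattern above is equivalent to a single
        # list comprehension:
        #     Web_Link_ = [x for x in Web_Link_ if not x.lower().endswith('.gif')]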

        time.sleep(3)
        if Ocean_Time != 'index.php':
            for Link_A in range(len(Web_Link_)):
                Link_Add_ = Web_Link_[Link_A]
                Link_part = re.split("/", Link_Add_)  # split on '/' into a list
                Ocean_dataName = Link_part[0]  # the data file name
                url_Data = 'http://weather.unisys.com/hurricane/' + Ocean_Folder + "/" + Ocean_Time + "/"
                connet_ = urljoin(url_Data, Link_Add_)

                time.sleep(1)
                # download the data
                Ocean_dataFile = Ocean_dataName + '.json'
                file = os.path.join(Download_dir, Ocean_Folder, Ocean_Time) + "/"  # build the target directory path
                mkdir_p(file)
                print(connet_)
                if os.path.isfile(file + Ocean_dataFile):
                    print('File already exists')
                else:
                    try:
                        url = connet_
                        wp = urllib.request.urlopen(url)  # open the data page
                        content = wp.read()

                        fp = open(file + Ocean_dataFile, "wb")  # write into the target folder
                        fp.write(content)  # write the data
                        fp.close()  # close the file
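                        # Note: urllib.request.urlretrieve(url, file + Ocean_dataFile)
                        # downloads straight to disk and could replace the
                        # open/read/write/close sequence above.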
                    except Exception:
                        print('Failed to fetch link: ' + url)
                        continue

        else:
            for Link_B in range(len(Web_Link_)):
                _Link_Add = Web_Link_[Link_B]
                Link_part_ = re.split("/", _Link_Add)  # split on '/' into a list
                Ocean_Time_ = Link_part_[1]  # second element: the year
                url_Typhoon_ = 'http://weather.unisys.com/hurricane/' + Ocean_Folder + "/"
                Connet = urljoin(url_Typhoon_, _Link_Add)
                _Web_Link = gain(Connet)
                time.sleep(3)

                # drop the redundant .gif links
                _Gifdata = []
                for _Gif_ in range(len(_Web_Link)):
                    _Gifdata_ = _Web_Link[_Gif_]
                    findGif = re.findall(r'\.gif$', _Gifdata_, re.I)  # escape the dot to match a literal '.'
                    if findGif:
                        _Gifdata.append(_Gifdata_)
                        # print(_Gifdata)
                    else:
                        continue
                for _Gif in range(len(_Gifdata)):
                    _Web_Link.remove(_Gifdata[_Gif])

                for Link_B_ in range(len(_Web_Link)):
                    _Link_Add_ = _Web_Link[Link_B_]
                    _Link_part_ = re.split("/", _Link_Add_)  # split on '/' into a list
                    _Ocean_dataName_ = _Link_part_[0]  # first element: the data file name
                    url_Data_ = 'http://weather.unisys.com/hurricane/' + Ocean_Folder + "/" + Ocean_Time_ + "/"
                    Connet_ = urljoin(url_Data_, _Link_Add_)
                    time.sleep(1)
                    # download the data
                    Ocean_dataName = _Ocean_dataName_ + '.json'
                    file = os.path.join(Download_dir, Ocean_Folder, Ocean_Time_) + "/"  # build the target directory path
                    mkdir_p(file)
                    print(Connet_)
                    if os.path.isfile(file + Ocean_dataName):
                        print('File already exists')
                    else:
                        try:
                            url = Connet_
                            wp = urllib.request.urlopen(url)  # open the data page
                            content = wp.read()
                            fp = open(file + Ocean_dataName, "wb")  # write into the target folder
                            fp.write(content)  # write the data
                            fp.close()  # close the file
                        except Exception:
                            print('Failed to fetch link: ' + url)
                            continue


if __name__ == '__main__':
    main()
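
The .gif filter appears twice in main(); pulling it into a small helper would remove the duplication. A minimal sketch (drop_suffix is a hypothetical name introduced here, not part of the script above):

    def drop_suffix(links, suffix='.gif'):
        # keep only the links that do not end with the given suffix (case-insensitive)
        return [link for link in links if not link.lower().endswith(suffix.lower())]

Each append/remove block then becomes, e.g., Web_Link_ = drop_suffix(Web_Link_).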

posted @ 2018-03-13 15:08  Moucong