Python基础(十三)爬虫demo
美女图片网
访问分析网站
需要从下图中找到当前页
再从下图中取出图片地址
具体代码如下:
import urllib.request
import os
import re
def url_open(url, proxy='116.62.134.173:9999'):
    """Fetch *url* through an HTTP proxy and return the raw response bytes.

    Args:
        url: Address to fetch.
        proxy: ``host:port`` of the HTTP proxy to route the request through.
            Defaults to the proxy the original script hard-coded, so existing
            callers see identical behavior.

    Returns:
        The response body as ``bytes``.
    """
    # Route through a proxy and present a browser User-Agent: many image
    # sites reject requests carrying Python's default urllib User-Agent.
    proxy_support = urllib.request.ProxyHandler({'http': proxy})
    opener = urllib.request.build_opener(proxy_support)
    opener.addheaders = [('User-Agent',
                          'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/49.0.2623.87 Safari/537.36')]
    response = opener.open(url)
    html = response.read()
    return html
# 取地址页面是第几页
# <span class="current">5</span>
def get_page(url):
    """Return the current page number displayed on *url*, as a string.

    The page markup contains an anchor such as
    ``<a class="this" href="javascript:void(0)" >5</a>``; the digits inside
    that anchor are the current page number.
    """
    html = url_open(url).decode('utf-8')
    anchor = re.search(r'<a class="this" href="javascript:void\(0\)" >\d{1,3}</a>', html)
    # BUG FIX: the original pattern r'[1-9]|[1-9]\d' tried the single-digit
    # alternative first, so page "12" was truncated to "1".  Matching the
    # whole digit run returns the full page number.
    number = re.search(r'\d{1,3}', anchor.group(0))
    return number.group(0)
def find_imgs(url):
    """Return every magazine-cover image URL found in the page at *url*.

    Matches ``src`` values of the form
    ``https://img.ugirls.tv/uploads/magazine/cover/<hash>_cover_web_l.jpg``.
    """
    html = url_open(url).decode('utf-8')
    # BUG FIX: the original greedy '.+\.jpg' could run up to the LAST
    # '.jpg' on a line, fusing several URLs into one bogus match.
    # '[^"]+?' cannot cross the closing quote of the src attribute.
    img_addrs = re.findall(r'https://img\.ugirls\.tv/uploads/magazine/cover/[^"]+?\.jpg', html)
    print(img_addrs)
    return img_addrs
def save_imgs(img_addrs):
    """Download every image URL in *img_addrs* into the current directory.

    The local file name is the last path segment of the URL, e.g.
    ``.../e5b7..._cover_web_l.jpg`` is saved as ``e5b7..._cover_web_l.jpg``.
    A download or write error is printed and the next image is attempted.
    """
    for addr in img_addrs:
        # The final URL segment doubles as the local file name.
        filename = addr.split('/')[-1]
        try:
            data = url_open(addr)
            with open(filename, 'wb') as out:
                out.write(data)
        except OSError as err:
            print(err)
#pages下载几页的数据
def download_mm(folder='pic', pages=2 ):
url = "https://www.ugirls.com/Content/"
page_num = int(get_page(url))
try:
os.mkdir(folder)
except:
print('目录已存在')
finally:
os.chdir(folder)
for i in range(0, pages):
folderTemp = folder + str(page_num)
try:
os.mkdir(folderTemp)
os.chdir(folderTemp)
except:
# 如果目录存在,直接跳过,有可能是上次下载的时候出错或暂停了
print('目录已存在')
continue
page_url = url +"Page-"+ str(page_num) + ".html"
img_addrs = find_imgs(page_url)
save_imgs(img_addrs)
os.chdir(os.pardir)
page_num += 1
# Script entry point: run the crawler with its default folder and page count.
if __name__ == '__main__':
    download_mm()
您的资助是我最大的动力!
金额随意,欢迎来赏!