Python 3 实现网页爬虫下载图片

https://www.jb51.net/article/119178.htm

# -*- coding: UTF-8 -*-
'''
https://github.com/halibobo/runnerbar-image

https://www.jb51.net/article/69153.html
'''

import re
import urllib.request

# ------ Fetch the raw source of a web page ------
def getHtml(url):
    """Open *url* and return the undecoded response body.

    Args:
        url: Address passed straight to ``urllib.request.urlopen``.

    Returns:
        The response body as ``bytes``; decoding is left to the caller.
    """
    # The context manager closes the connection even when read() raises,
    # instead of leaving the response open until garbage collection.
    with urllib.request.urlopen(url) as page:
        return page.read()

# ------ Pass any thread URL to getHtml(), then decode the returned bytes as UTF-8 ------
html = getHtml("https://tieba.baidu.com/p/5352556650").decode('UTF-8')

# ------ Collect every .jpg image address found in a page ------
# Captures the value of a src="..." attribute that ends in ".jpg". The value
# may contain neither a double quote nor whitespace, so a match can never run
# past the closing quote into a neighbouring attribute.
_JPG_SRC_RE = re.compile(r'src="([^"\s]*\.jpg)"')


def getImg(html):
    """Return the ``src`` values in *html* that end in ``.jpg``.

    Args:
        html: Page source as ``str``.

    Returns:
        A list of the captured attribute values, in document order; empty
        when nothing matches.
    """
    return _JPG_SRC_RE.findall(html)

imgList = getImg(html)


## download_url

## Hard-coded download URLs (taken from a JSON response).
# A list rather than a set: a set has no defined iteration order, so the
# chunks were fetched in arbitrary order and the numeric file-name suffix did
# not match the position of the URL written here.
imgList_test = [
    "https://gjnkw6amjgits2kc304.exp.bcedocument.com/target/doc-hivu9jd6p2i4siw/doc-hivu9jd6p2i4siw/0.jpg?x-bce-range=0-191200",
    "https://gjnkw6amjgits2kc304.exp.bcedocument.com/target/doc-hivu9jd6p2i4siw/doc-hivu9jd6p2i4siw/0.jpg?x-bce-range=191200-411580",
    "https://gjnkw6amjgits2kc304.exp.bcedocument.com/target/doc-hivu9jd6p2i4siw/doc-hivu9jd6p2i4siw/0.jpg?x-bce-range=411580-564343",
    "https://gjnkw6amjgits2kc304.exp.bcedocument.com/target/doc-hivu9jd6p2i4siw/doc-hivu9jd6p2i4siw/0.jpg?x-bce-range=564343-660625",
    "https://gjnkw6amjgits2kc304.exp.bcedocument.com/target/doc-hivu9jd6p2i4siw/doc-hivu9jd6p2i4siw/0.jpg?x-bce-range=660625-700778",
]


# The index from enumerate() numbers the output files in list order; it
# advances for failed downloads too, so numbering stays tied to position.
for imgName, imgPath in enumerate(imgList_test):
    try:
        # Download before opening the target, so a failed request does not
        # leave an empty file behind.
        with urllib.request.urlopen(imgPath) as response:
            data = response.read()
        with open('D:\\temp\\'+"测试技术简介第一章_"+ str(imgName)+".jpg", 'wb') as f:
            f.write(data)
        print(imgPath)
    except Exception as e:
        # Best effort: report the failure with its cause and keep going.
        print(imgPath+" error", e)

print("All Done!")


#######################

# -*- coding: UTF-8 -*-
'''
https://github.com/halibobo/runnerbar-image

https://www.jb51.net/article/69153.html

'''

import bs4
import urllib
from urllib import request
from bs4 import BeautifulSoup as bs
from urllib.error import URLError, HTTPError
import xlwt
from xlwt import Workbook
import json
import requests
import time
import datetime
import os

def getRaceInfo(id, page, timeout=30):
    """Request one page of an activity's photo album.

    POSTs ``activity_id``, ``page`` and a fixed ``pageSize`` of 100 to the
    album endpoint and reads the ``album`` object of the JSON reply.

    Args:
        id: Activity id sent as ``activity_id``.
        page: Page number sent as ``page``.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        A dict with ``'items'`` (a list copy of ``album['searchResultList']``)
        and ``'count'`` (``album['activity_photo_count']`` as returned).

    Raises:
        KeyError: If the reply lacks one of the expected keys.
        Whatever ``requests.post`` / ``Response.json`` raise, including a
        timeout error when the server does not answer in time.
    """
    url = 'http://m.yundong.runnerbar.com/yd_mobile/share/album.json'
    para = {'activity_id': id, 'page': page, 'pageSize': 100}
    # Without an explicit timeout requests waits indefinitely on a stalled
    # server, which would hang the whole scrape.
    r = requests.post(url, data=para, headers={}, timeout=timeout)
    album = r.json()['album']
    return {
        'items': list(album['searchResultList']),
        'count': album['activity_photo_count'],
    }


def startRace(id):
    """Download every photo of activity *id* into the folder ``'book' + id``.

    The first album page supplies the photo count. Pages are then walked in
    order and each item's ``url_hq`` is handed to ``save_img`` with a running
    number, starting at 1001, as the file name. Errors are printed, never
    raised, so one bad page does not abort the run.

    Args:
        id: Activity id as a string (it is concatenated onto ``'book'``).
    """
    row_index = 1001
    try:
        activity = getRaceInfo(id, 1)
        count = int(activity['count'])
    except Exception as e:
        print('错误 :', e)
        return
    if count <= 0:
        return
    print(id, count)
    # Same upper bound as before (it deliberately over-shoots the real page
    # count); the empty-page check below ends the walk early.
    for page in range(1, count // 100 + 2):
        try:
            # Reuse the page already fetched for the count instead of
            # requesting page 1 a second time.
            items = activity['items'] if page == 1 else getRaceInfo(id, page)['items']
            if not items:
                break
            for item in items:
                print(item['url_hq'])
                save_img(item['url_hq'], row_index, 'book' + id)
                row_index += 1
                print(row_index)
        except HTTPError as e:
            print('Error code: ', e.code)
        except URLError as e:
            print('Reason: ', e.reason)
        except Exception as e:
            print('错误 :', e)

def save_img(img_url, file_name, file_path='img'):
    """Download *img_url* to ``<file_path>/<file_name><suffix>``.

    The suffix is the extension of the URL's path component, so a query
    string never ends up in the file name. The target directory is created
    when missing. Failures are printed, not raised.

    Args:
        img_url: Address of the image to fetch.
        file_name: Base name of the saved file, without extension.
        file_path: Destination directory; defaults to ``'img'`` relative to
            the current working directory.
    """
    from urllib.parse import urlsplit  # local import: only needed here

    try:
        if not os.path.exists(file_path):
            print('文件夹',file_path,'不存在,重新建立')
            # exist_ok guards against the directory appearing between the
            # check above and this call.
            os.makedirs(file_path, exist_ok=True)
        # Extension of the URL path only ("/a/b.jpg?x=1" -> ".jpg").
        file_suffix = os.path.splitext(urlsplit(img_url).path)[1]
        # Full target name including the directory.
        filename = '{}{}{}{}'.format(file_path, os.sep, file_name, file_suffix)
        urllib.request.urlretrieve(img_url, filename=filename)
    except HTTPError as e:
        # Must precede IOError: HTTPError is an OSError subclass, so the
        # IOError handler would otherwise catch it and this branch would
        # never run.
        print('Error code: ', e.code)
    except IOError as e:
        print('文件操作失败',e)
    except Exception as e:
        print('错误 :',e)
# Author's note kept from the original: 0-100 10712 7376 10765
# (presumably other activity ids or ranges that were tried; verify before reuse).
# Run the scraper for each activity id in the half-open range below.
for race_id in map(str, range(7835, 7836)):
    startRace(race_id)
posted @ 小强找BUG  阅读(836)  评论(0)  编辑  收藏  举报
编辑推荐:
· Linux系列:如何用heaptrack跟踪.NET程序的非托管内存泄露
· 开发者必知的日志记录最佳实践
· SQL Server 2025 AI相关能力初探
· Linux系列:如何用 C#调用 C方法造成内存泄露
· AI与.NET技术实操系列(二):开始使用ML.NET
阅读排行:
· 被坑几百块钱后,我竟然真的恢复了删除的微信聊天记录!
· 没有Manus邀请码?试试免邀请码的MGX或者开源的OpenManus吧
· 【自荐】一款简洁、开源的在线白板工具 Drawnix
· 园子的第一款AI主题卫衣上架——"HELLO! HOW CAN I ASSIST YOU TODAY
· Docker 太简单,K8s 太复杂?w7panel 让容器管理更轻松!
点击右上角即可分享
微信分享提示