def parsePageForImages(data):
    """Extract image URLs from an HTML document.

    Looks at every ``<img>`` tag and reads its ``src`` attribute, falling
    back to ``data-src`` (commonly used by lazy-loading pages). URLs that
    do not already start with ``http`` get an ``http:`` prefix.

    :param data: raw HTML markup to parse.
    :return: list of image URL strings.
    """
    soup = BeautifulSoup(data, 'html.parser')
    urls = []
    for tag in soup.find_all('img'):
        candidate = tag.get('src') or tag.get('data-src')
        if not candidate:
            continue
        # NOTE(review): this correctly completes protocol-relative URLs
        # ('//cdn...'), but a root-relative path like '/img/a.jpg' becomes
        # the invalid 'http:/img/a.jpg' — confirm the target site only
        # serves absolute or protocol-relative image URLs.
        if not candidate.startswith('http'):
            candidate = 'http:' + candidate
        urls.append(candidate)
    return urls
它会依次检查 src 和 data-src 属性，并为协议相对（以 // 开头）的 URL 补全 http: 前缀。
下面这个函数用于并发下载多张图片。
def downloadImages(img_urls, path='./candyimages/', limit=38):
    """Concurrently download up to *limit* images into *path*.

    Fixes from the scraped original: ``defdownloadImages`` and ``ifnot``
    were fused tokens (syntax errors), and the ``except`` clause had been
    swallowed into an inline comment, leaving a bare ``try``.

    :param img_urls: sequence of image URL strings (sliced to *limit*).
    :param path: destination directory; created if it does not exist.
    :param limit: maximum number of URLs to submit for download.
    """
    # exist_ok avoids the exists()-then-makedirs race of the original.
    os.makedirs(path, exist_ok=True)
    with ThreadPoolExecutor(max_workers=10) as executor:
        future_to_url = {
            executor.submit(downloadImage, url, path): url
            for url in img_urls[:limit]
        }
        for future in as_completed(future_to_url):
            url = future_to_url[future]
            try:
                # result() re-raises any exception from the worker thread.
                future.result()
            except Exception as e:
                print(f"{url} generated an exception: {e}")
· [翻译] 为什么 Tracebit 用 C# 开发
· 腾讯ima接入deepseek-r1,借用别人脑子用用成真了~
· Deepseek官网太卡,教你白嫖阿里云的Deepseek-R1满血版
· DeepSeek崛起:程序员“饭碗”被抢,还是职业进化新起点?
· RFID实践——.NET IoT程序读取高频RFID卡/标签