Fetching Harvard Medical School images with Python
Using Python to collect images from the Harvard Medical School (AANLIB) atlas for machine learning.
The following code is for personal study only and must not be used commercially!
No robots.txt was found on the site; if this infringes any rights, please let me know and it will be taken down.
1. After running, the script creates a folder named "harvard_picture" in the current directory.
2. It then crawls the linked pages and downloads every image it finds into that folder.
3. pathRoot is the recursion depth: a higher depth cap makes the image set more complete but the crawl slower, while a lower cap is faster but less complete (see the short sketch after this list).
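A minimal sketch of that trade-off, assuming a hypothetical module-level constant MAX_DEPTH (the original script hard-codes the cap as 10 inside FindHtml):

MAX_DEPTH = 10  # raise for a more complete crawl, lower for a faster one

def FindHtml(url, pathRoot):
    pathRoot = pathRoot + 1
    if pathRoot >= MAX_DEPTH:  # stop following links once the depth cap is reached
        return
    # ... fetch the page and recurse into its links, as in the full script below ...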
# -*- coding: utf-8 -*-
# Created on 2022-02-17
# Copyright (C) Joker Rights Reserved
# @author: Joker
# 1. Search for pictures starting from http://www.med.harvard.edu/AANLIB/home.html
# 2. Download the pictures found on the linked pages
import os
import time

import requests
from bs4 import BeautifulSoup
def FindPicture(picture_url, pathRoot, pictureName):
    # Download a single image/video file into the harvard_picture folder
    picture_url_upper = picture_url.upper()
    if ('PNG' in picture_url_upper or 'GIF' in picture_url_upper
            or 'JPEG' in picture_url_upper or 'MPG' in picture_url_upper):
        response_picture = requests.get(picture_url, stream=True)
        if response_picture.status_code == 200:  # the picture request succeeded
            fieldName = pictureName.split('/').pop()  # keep only the file-name part of the href
            DownLoadpictureName = './harvard_picture/' + fieldName  # local path for the download
            with open(DownLoadpictureName, 'wb') as f:  # write the response body to disk
                f.write(response_picture.content)
            print(fieldName + " downloaded to " + os.path.join(os.getcwd(), "harvard_picture", fieldName))
def FindHtml(url, pathRoot):
    # Recursively follow links on a page, downloading any pictures along the way
    pathRoot = pathRoot + 1
    if pathRoot >= 10:  # depth cap: stop recursing beyond this level
        return
    url_home = "http://www.med.harvard.edu/AANLIB/"
    header = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.157 Safari/537.36',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3',
        'Accept-Encoding': 'gzip, deflate, br',
        'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8,ja;q=0.7'
    }
    lv_response = requests.get(url, headers=header)
    lv_soup = BeautifulSoup(lv_response.text, 'html.parser')
    result_a = lv_soup.find_all('a')
    for ls_result_a in result_a:
        href = ls_result_a.attrs.get('href')
        if href is not None:
            url_web = url_home + href
            if href.find('htm') != -1:
                FindHtml(url_web, pathRoot)  # the link points at another page: recurse
            else:
                FindPicture(url_web, pathRoot, href)  # otherwise treat it as a picture link
# main function
def main():
    header = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.157 Safari/537.36',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3',
        'Accept-Encoding': 'gzip, deflate, br',
        'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8,ja;q=0.7'
    }
    try:
        os.mkdir("harvard_picture")
    except FileExistsError:
        print("Folder already exists, no need to create it")
    url_home = "http://www.med.harvard.edu/AANLIB/"
    url_web = url_home + 'home.html'
    lv_response = requests.get(url_web, headers=header)
    lv_soup = BeautifulSoup(lv_response.text, 'html.parser')
    result_a = lv_soup.find_all('a')
    for ls_result_a in result_a:
        href = ls_result_a.attrs.get('href')
        if href is not None:
            url_web = url_home + href
            pathRoot = 0
            if href.find('html') != -1:
                FindHtml(url_web, pathRoot)  # follow a sub-page from the home page
            else:
                FindPicture(url_web, pathRoot, href)  # download a picture linked directly from the home page

if __name__ == '__main__':
    start = time.perf_counter()
    main()
    end = time.perf_counter()
    print("Start: %s, End: %s, Elapsed: %.2f s" % (start, end, end - start))