Python获取哈佛大学医学院图片

Python 查找关于哈佛大学医学院图片用于机器学习,
以下代码仅可用于个人学习,不得用于商用!

暂未发现robots.txt文件,如果有侵权请告知删除
1、运行后会在当前目录下建立"harvard_picture"的文件夹,
2、然后会依次获取对应的网站图片下载到此文件夹中
3、pathRoot是指的递归层级,层级越大表示获得的图片越完整但是可能会遍历比较久,层级越低速度越快但是数据会不太完整

# -*- coding: utf-8 -*-
# coding:unicode_escape
#Created on 2022年2月17日
#Copyright (C) Joker Rights Reserved
#@author: Joker

#1、search picture from http://www.med.harvard.edu/AANLIB/home.html
#2、download picture from web
from re import A
import sys
import os
import time
from urllib import response
import requests
from bs4 import BeautifulSoup

def FindPicture(picture_url, pathRoot, pictureName=None):
    """Download one image into ./harvard_picture/ if the URL looks like an image.

    :param picture_url: absolute URL of the candidate file.
    :param pathRoot: current recursion depth (not used here; kept so the
        signature mirrors FindHtml's call sites).
    :param pictureName: the href string whose basename becomes the local file
        name. Defaults to picture_url itself, which also makes the
        two-argument call in main() work instead of raising TypeError.
    """
    if pictureName is None:
        pictureName = picture_url
    # Compare the extension case-insensitively on a COPY of the URL.
    # (Bug fix: the original upper-cased picture_url itself and then fetched
    # the upper-cased URL, which breaks case-sensitive server paths.)
    upper_url = picture_url.upper()
    if any(ext in upper_url for ext in ('PNG', 'GIF', 'JPEG', 'MPG')):
        response_picture = requests.get(picture_url, stream=True)
        if response_picture.status_code == 200:  # image fetched OK
            # Basename of the href becomes the local file name.
            fieldName = pictureName.split('/').pop()
            DownLoadpictureName = './harvard_picture/' + fieldName
            # Context manager guarantees the file handle is closed
            # (the original leaked the handle from open(...).write(...)).
            with open(DownLoadpictureName, 'wb') as fh:
                fh.write(response_picture.content)
            print(fieldName + " Download Completely to " + os.getcwd() + "\\harvard_picture\\" + fieldName)
def FindHtml(url, pathRoot, max_depth=10):
    """Recursively crawl an AANLIB page, downloading images it links to.

    :param url: page URL to fetch and scan for <a href=...> links.
    :param pathRoot: recursion depth of the caller; incremented on entry.
    :param max_depth: stop recursing once the depth reaches this value
        (generalized from the previously hard-coded limit of 10).
    """
    pathRoot = pathRoot + 1
    if pathRoot >= max_depth:
        return

    url_home = "http://www.med.harvard.edu/AANLIB/"
    header = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.157 Safari/537.36',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3',
        'Accept-Encoding': 'gzip, deflate, br',
        'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8,ja;q=0.7'
    }
    lv_response = requests.get(url, headers=header)
    lv_soup = BeautifulSoup(lv_response.text, 'html.parser')

    for ls_result_a in lv_soup.find_all('a'):
        # Hoist the attribute lookup; the original called attrs.get('href')
        # up to three times per anchor.
        href = ls_result_a.attrs.get('href')
        if href is None:
            continue
        url_web = url_home + href
        # 'htm' also matches '.html'; anything else is treated as a
        # candidate image and filtered by extension in FindPicture.
        if 'htm' in href:
            FindHtml(url_web, pathRoot, max_depth)
        else:
            FindPicture(url_web, pathRoot, href)
#mian function
def main():
    """Entry point: create the output folder, fetch the AANLIB home page and
    crawl every link on it, downloading images via FindHtml/FindPicture."""
    header = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.157 Safari/537.36',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3',
        'Accept-Encoding': 'gzip, deflate, br',
        'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8,ja;q=0.7'
    }
    # exist_ok replaces the original try/except around os.mkdir.
    os.makedirs("harvard_picture", exist_ok=True)

    url_home = "http://www.med.harvard.edu/AANLIB/"
    # url_home already ends with '/', so append without a leading slash
    # (the original produced '.../AANLIB//home.html').
    url_web = url_home + 'home.html'
    lv_response = requests.get(url_web, headers=header)
    lv_soup = BeautifulSoup(lv_response.text, 'html.parser')

    for ls_result_a in lv_soup.find_all('a'):
        href = ls_result_a.attrs.get('href')
        if href is None:
            continue
        url_web = url_home + href
        pathRoot = 0
        if 'html' in href:
            FindHtml(url_web, pathRoot)
        else:
            # Bug fix: the original called FindPicture with only two
            # arguments, raising TypeError for every non-html link.
            FindPicture(url_web, pathRoot, href)

if __name__ == '__main__':
    # Time the whole crawl with a monotonic high-resolution clock.
    start = time.perf_counter()
    main()
    end = time.perf_counter()
    # Bug fix: the original used printf-style arguments with print(),
    # which printed the format string and values comma-separated
    # instead of substituting them.
    print(f"Start:{start},End:{end}")

posted @ 2022-02-18 11:23  linhuang  阅读(7)  评论(0编辑  收藏  举报  来源