dl_images_4.py
#!/usr/bin/env python3
"""Batch-download images whose URLs are listed in CSV files.

Reads the image URLs (plus licence plate and capture time, which are used to
build the output file name) from ``./xg_fail_rec.csv`` and
``./am_fail_rec.csv``, requests every URL with automatic retries, and stores
the images under ``fail_images/fail_xg/`` and ``fail_images/fail_am/`` next to
this script.  Each download runs in its own child process; at most
``cpu_count()`` children are alive at any time.
"""
import logging
import multiprocessing
import os
import re
import sys
import time
from logging.handlers import RotatingFileHandler
from multiprocessing import Process
from multiprocessing import cpu_count

import pandas as pd
import requests
from clickhouse_driver import Client
from requests import Session
from requests.adapters import HTTPAdapter
from requests.packages.urllib3.util import Retry

# Root logger, bound at import time so that child processes started with the
# "spawn"/"forkserver" methods (which re-import this module without running
# the ``__main__`` guard) can still resolve the name.
logger = logging.getLogger()

BASE_DIR = os.path.dirname(os.path.abspath(__file__))
LOG_FILE = os.path.join(BASE_DIR, "dl_images.log")

# (connect, read) timeout in seconds; without one a stalled server would
# block a worker process forever.
REQUEST_TIMEOUT = (5, 30)

# Seconds to sleep between checks for a free worker slot.
POLL_INTERVAL = 0.05


def _configure_logging():
    """Attach the rotating file handler to the root logger (idempotent)."""
    if logger.handlers:
        return
    logger.setLevel(logging.INFO)
    handler = RotatingFileHandler(LOG_FILE)
    handler.setFormatter(
        logging.Formatter('%(asctime)s - %(levelname)s - %(message)s'))
    logger.addHandler(handler)


def _read_fail_records(csv_path):
    """Return the (image_url1, license_plate2, capture_time) columns of a CSV."""
    df = pd.read_csv(csv_path)
    return df['image_url1'], df['license_plate2'], df['capture_time']


def get_xg_images_url():
    """Load URL, plate and capture-time columns for Hong Kong plates."""
    return _read_fail_records('./xg_fail_rec.csv')


def get_am_images_url():
    """Load URL, plate and capture-time columns for Macau plates."""
    return _read_fail_records('./am_fail_rec.csv')


def _record_failure(access_fail_c):
    """Increment the shared failure counter atomically across processes."""
    # ``value = value + 1`` is a read-modify-write; without the lock two
    # children can read the same value and one increment is lost.
    with access_fail_c.get_lock():
        access_fail_c.value += 1


def download_img(img_url, num, result_path, plateNo, ct, access_fail_c):
    """Download one image and save it as ``<plateNo>_<ct>+<num>.jpg``.

    Intended to run as the target of a child process.

    Args:
        img_url: URL of the image to fetch.
        num: Sequence number of this download; used in the file name and in
            log messages.
        result_path: Directory the image is written to (created if missing).
        plateNo: Licence plate, first component of the file name.
        ct: Capture time, second component of the file name.
        access_fail_c: Shared ``multiprocessing.Value`` counting failures;
            incremented on connection error, non-200 status or write error.

    Exits the process with status 1 when the request itself fails.
    """
    # No-op when handlers are already present (fork); required when the
    # child was spawned and therefore starts with an unconfigured logger.
    _configure_logging()

    # Build an explicit path instead of chdir()-ing: no hidden cwd change,
    # and a missing output directory no longer crashes the worker.
    os.makedirs(result_path, exist_ok=True)
    # format() also copes with non-string CSV cells (e.g. NaN or numbers).
    img_name = '{}_{}+{}.jpg'.format(plateNo, ct, num)
    img_path = os.path.join(result_path, img_name)

    retries = Retry(total=10, backoff_factor=0.1, status_forcelist=[500])
    try:
        with Session() as s:
            adapter = HTTPAdapter(max_retries=retries)
            s.mount('http://', adapter)
            s.mount('https://', adapter)
            img_obj = s.get(img_url, timeout=REQUEST_TIMEOUT)
    except Exception:
        # Top level of the child process: count and log every failure, but
        # let KeyboardInterrupt / SystemExit propagate.
        _record_failure(access_fail_c)
        logger.error("connect fail {} ".format(img_url))
        logger.info("child_process {} exited... ".format(num))
        sys.exit(1)

    if img_obj.status_code != 200:
        _record_failure(access_fail_c)
        logger.error("download {} fail staus = {}".format(
            img_url, img_obj.status_code))
    else:
        try:
            with open(img_path, 'wb') as f:
                f.write(img_obj.content)
        except OSError:
            _record_failure(access_fail_c)
            logger.error("saved fail {} fail staus = {}".format(
                img_url, img_obj.status_code))
        else:
            logger.info("saved success {} staus = {}".format(
                img_url, img_obj.status_code))
    logger.info("child_process {} exited... ".format(num))


def _wait_for_slot(process_list, limit):
    """Block until fewer than ``limit`` processes in the list are alive."""
    while True:
        # Rebuild the list rather than remove() during iteration, which
        # skips the element following every removed one.
        process_list[:] = [p for p in process_list if p.is_alive()]
        if len(process_list) < limit:
            return
        # Yield the CPU instead of spinning at 100% while workers run.
        time.sleep(POLL_INTERVAL)


def _download_batch(image_url, license_plate, cts, result_path,
                    access_fail_c, max_procs):
    """Download every URL of one batch, one child process per image."""
    process_list = []
    # zip() pairs the columns positionally, so this also works when the
    # Series do not carry a default 0..n-1 index.
    rows = zip(image_url, license_plate, cts)
    for i, (url, plate, ct) in enumerate(rows):
        _wait_for_slot(process_list, max_procs)
        pro = Process(
            target=download_img,
            args=(url, i, result_path, plate, ct, access_fail_c))
        logger.info("child_process {} started... ".format(i))
        pro.start()
        process_list.append(pro)
    for p in process_list:
        p.join()


def start_process():
    """Download all Hong Kong and Macau plate images and print the totals."""
    # Integer counter ('i'); the lock that comes with Value is used by
    # _record_failure to make increments atomic.
    access_fail_c = multiprocessing.Value('i', 0)
    max_procs = cpu_count()

    # Download Hong Kong plate images.
    image_url, license_plate, cts = get_xg_images_url()
    print('港牌url总数: {}'.format(len(image_url)))
    result_path = os.path.join(BASE_DIR, "fail_images", "fail_xg")
    _download_batch(image_url, license_plate, cts, result_path,
                    access_fail_c, max_procs)

    # Download Macau plate images.
    image_url, license_plate, cts = get_am_images_url()
    print('澳牌url总数: {}'.format(len(image_url)))
    result_path = os.path.join(BASE_DIR, "fail_images", "fail_am")
    _download_batch(image_url, license_plate, cts, result_path,
                    access_fail_c, max_procs)

    print('访问图片失败总数:{}'.format(access_fail_c.value))


if __name__ == '__main__':
    _configure_logging()
    start_process()
【推荐】国内首个AI IDE,深度理解中文开发场景,立即下载体验Trae
【推荐】编程新体验,更懂你的AI,立即体验豆包MarsCode编程助手
【推荐】抖音旗下AI助手豆包,你的智能百科全书,全免费不限次数
【推荐】轻量又高性能的 SSH 工具 IShell:AI 加持,快人一步
· 无需6万激活码!GitHub神秘组织3小时极速复刻Manus,手把手教你使用OpenManus搭建本
· C#/.NET/.NET Core优秀项目和框架2025年2月简报
· Manus爆火,是硬核还是营销?
· 终于写完轮子一部分:tcp代理 了,记录一下
· 【杭电多校比赛记录】2025“钉耙编程”中国大学生算法设计春季联赛(1)