dl_images_gt.py
#!/usr/bin/env python3
"""Download face images listed in ClickHouse into ``result_images/``.

The script queries ``yisa_oe.face_all`` for image URLs and capture times,
then downloads every image in its own child process, keeping at most
``cpu_count()`` children alive at any moment.  Progress and failures are
written to ``dl_images.log`` next to this file.
"""
import os
import sys
import datetime
import pandas as pd
import requests
from requests import Session
from requests.packages.urllib3.util import Retry
from requests.adapters import HTTPAdapter
import time
import logging
from logging.handlers import RotatingFileHandler
import re
from clickhouse_driver import Client
from multiprocessing import Process
from multiprocessing import cpu_count
import multiprocessing

# Directory that holds this script; the log file and images live beside it.
BASE_DIR = os.path.dirname(os.path.abspath(__file__))
LOG_PATH = os.path.join(BASE_DIR, "dl_images.log")

# (connect, read) timeouts in seconds so a stalled server cannot hang a worker.
REQUEST_TIMEOUT = (10, 60)

# Pause between liveness polls while waiting for a free worker slot.
POLL_INTERVAL = 0.05

# Root logger, bound at import time so child processes created with the
# "spawn"/"forkserver" start methods can resolve the name as well.
logger = logging.getLogger()


def _configure_logging():
    """Attach the rotating file handler to the root logger exactly once."""
    if any(isinstance(h, RotatingFileHandler) for h in logger.handlers):
        return
    logger.setLevel(logging.DEBUG)
    handler = RotatingFileHandler(LOG_PATH)
    formatter = logging.Formatter('%(asctime)s - %(levelname)s - %(message)s')
    handler.setFormatter(formatter)
    logger.addHandler(handler)


def _record_failure(access_fail_c):
    """Increment the shared failure counter, under its lock when it has one."""
    get_lock = getattr(access_fail_c, 'get_lock', None)
    if get_lock is None:
        # Raw (lock-free) value: keep the original best-effort increment.
        access_fail_c.value = access_fail_c.value + 1
        return
    with get_lock():
        access_fail_c.value += 1


def get_images_url():
    """Query ClickHouse for the images to download.

    Returns:
        A ``(image_url1, capture_time)`` pair of pandas Series taken from the
        query result, in result order.

    Exits the process with status 1 if the client cannot be created or the
    query fails.
    """
    # NOTE(review): the password used to be hard-coded only.  It can now be
    # overridden through CLICKHOUSE_PASSWORD; the literal stays as the
    # fallback so existing deployments keep working.
    password = os.environ.get('CLICKHOUSE_PASSWORD', 'Ys_gz@2022')
    try:
        cursor = Client(host='127.0.0.1', port=9001, password=password)
    except Exception:
        logger.exception("连接失败!")
        sys.exit(1)

    sql = "select image_url1,capture_time from yisa_oe.face_all where hair_id = 1 and (clarity_id = 1 or clarity_id = 2) limit 20000"
    try:
        results = cursor.execute(sql)
    except Exception:
        logger.exception("语句执行错误!")
        sys.exit(1)
    finally:
        # Release the connection whether or not the query succeeded.
        cursor.disconnect()

    data_list = []
    for row in results:
        logger.info(row)
        data_list.append(list(row))

    df = pd.DataFrame(data_list, columns=['image_url1', 'capture_time'])
    return df['image_url1'], df['capture_time']


def download_img(img_url, num, result_path, ct, access_fail_c):
    """Download one image and save it as ``_<ct>_<num>.jpg`` in *result_path*.

    Args:
        img_url: URL of the image to fetch.
        num: Sequence number of this download; used in the file name and logs.
        result_path: Directory the image is written to.
        ct: Capture time of the image; used in the file name.
        access_fail_c: Shared counter (object with a ``value`` attribute)
            incremented once for every failed download or save.

    Exits the calling process with status 1 when the HTTP request itself
    fails; a non-200 response or a write error is only logged and counted.
    """
    # A spawned child does not inherit the parent's handlers; this is a no-op
    # when the handler is already attached (e.g. after fork).
    _configure_logging()

    # format() rather than '+' so a non-str capture time does not raise.
    img_name = '_{}_{}.jpg'.format(ct, num)
    # Build an absolute target instead of chdir-ing into the directory.
    img_path = os.path.join(result_path, img_name)

    retries = Retry(total=10, backoff_factor=0.1, status_forcelist=[500])
    try:
        with Session() as s:
            adapter = HTTPAdapter(max_retries=retries)
            s.mount('http://', adapter)
            s.mount('https://', adapter)
            img_obj = s.get(img_url, timeout=REQUEST_TIMEOUT)
    except Exception:
        _record_failure(access_fail_c)
        logger.exception("connect fail {} ".format(img_url))
        logger.info("child_process {} exited... ".format(num))
        sys.exit(1)

    # Anything other than 200 (e.g. 404) is counted as a failed download.
    if img_obj.status_code != 200:
        _record_failure(access_fail_c)
        logger.error("download {} fail staus = {}".format(img_url, img_obj.status_code))
    else:
        try:
            with open(img_path, 'wb') as f:
                f.write(img_obj.content)
            logger.info("saved success {} staus = {}".format(img_url, img_obj.status_code))
        except OSError:
            _record_failure(access_fail_c)
            logger.error("saved fail {} fail staus = {}".format(img_url, img_obj.status_code))
    logger.info("child_process {} exited... ".format(num))


def start_process():
    """Download every queried image, running up to ``cpu_count()`` children.

    Creates the ``result_images`` directory next to this script if needed,
    starts one child process per image, waits for all of them, and logs the
    total number of failures.
    """
    # Integer counter; its built-in lock makes increments from children safe.
    access_fail_c = multiprocessing.Value('i', 0)
    image_url, cts = get_images_url()

    result_path = os.path.join(BASE_DIR, "result_images")
    os.makedirs(result_path, exist_ok=True)

    max_workers = cpu_count()
    process_list = []
    for i, (url, ct) in enumerate(zip(image_url, cts)):
        # Wait for a free slot.  The list is rebuilt instead of calling
        # remove() during iteration, which would skip elements.
        while len(process_list) >= max_workers:
            process_list = [p for p in process_list if p.is_alive()]
            if len(process_list) >= max_workers:
                time.sleep(POLL_INTERVAL)

        pro = Process(target=download_img,
                      args=(url, i, result_path, ct, access_fail_c))
        logger.info("child_process {} started... ".format(i))
        pro.start()
        process_list.append(pro)

    for p in process_list:
        p.join()

    logger.info("all downloads finished, {} failed".format(access_fail_c.value))


if __name__ == '__main__':
    _configure_logging()
    start_process()
【推荐】国内首个AI IDE,深度理解中文开发场景,立即下载体验Trae
【推荐】编程新体验,更懂你的AI,立即体验豆包MarsCode编程助手
【推荐】抖音旗下AI助手豆包,你的智能百科全书,全免费不限次数
【推荐】轻量又高性能的 SSH 工具 IShell:AI 加持,快人一步
· 无需6万激活码!GitHub神秘组织3小时极速复刻Manus,手把手教你使用OpenManus搭建本
· C#/.NET/.NET Core优秀项目和框架2025年2月简报
· Manus爆火,是硬核还是营销?
· 终于写完轮子一部分:tcp代理 了,记录一下
· 【杭电多校比赛记录】2025“钉耙编程”中国大学生算法设计春季联赛(1)