dl_images_gt.py

 

 

复制代码
#!/usr/bin/env python3
import os
import sys
import datetime
import pandas as pd
import requests
from requests import Session
from requests.packages.urllib3.util import Retry
from requests.adapters import HTTPAdapter
import time
import logging
from logging.handlers import RotatingFileHandler
import re
from clickhouse_driver import Client
from multiprocessing import Process
from multiprocessing import cpu_count
import multiprocessing
def get_images_url():
    """Fetch face image URLs and capture times from the local ClickHouse server.

    Runs a fixed query against ``yisa_oe.face_all`` (``hair_id = 1`` and
    ``clarity_id`` 1 or 2, capped at 20000 rows).

    Returns:
        A ``(image_url1, capture_time)`` pair of pandas Series taken from a
        DataFrame built from the result rows. Both are empty when the query
        returns no rows.

    Raises:
        SystemExit: with status 1 when the client cannot be created or the
            query fails (the error, including its traceback, is logged first).
    """
    log = logging.getLogger()
    try:
        # NOTE(review): host, port and password are hard-coded here; consider
        # loading them from configuration or the environment instead.
        cursor = Client(host='127.0.0.1', port=9001, password='Ys_gz@2022')
    except Exception:
        # Failures are errors, not informational messages; keep the traceback.
        log.exception("连接失败!")
        sys.exit(1)

    sql = "select image_url1,capture_time from yisa_oe.face_all where hair_id = 1  and (clarity_id = 1 or clarity_id = 2) limit 20000"
    try:
        results = cursor.execute(sql)
    except Exception:
        log.exception("语句执行错误!")
        sys.exit(1)
    finally:
        # The rows are already materialised, so the connection is no longer needed.
        cursor.disconnect()

    data_list = []
    for row in results:
        # Per-row output is diagnostic detail, so it is logged at DEBUG level.
        log.debug(row)
        data_list.append(list(row))
    log.info("fetched %d rows", len(data_list))

    df = pd.DataFrame(data_list, columns=['image_url1', 'capture_time'])
    return df['image_url1'], df['capture_time']
def _count_failure(counter):
    """Atomically add one to the shared multiprocessing counter."""
    # A plain read-modify-write on ``.value`` is not atomic across processes,
    # so the increment is done while holding the Value's own lock.
    with counter.get_lock():
        counter.value += 1


def download_img(img_url, num, result_path, ct, access_fail_c, timeout=30):
    """Download one image and save it as ``_<ct>_<num>.jpg`` in ``result_path``.

    Intended to run as the target of a child process. Every failure (request
    error, non-200 status, write error) increments ``access_fail_c`` and is
    logged on the root logger.

    Args:
        img_url: URL of the image to fetch.
        num: Sequence number used in the file name and in log messages.
        result_path: Directory to write the image into; created if missing.
        ct: Capture-time value embedded in the file name; converted with
            ``str()`` so non-string values (e.g. timestamps) are accepted.
        access_fail_c: Shared ``multiprocessing.Value`` failure counter.
        timeout: Seconds passed to the HTTP GET as its timeout, so a stalled
            server cannot block the worker forever.

    Raises:
        SystemExit: with status 1 when the request itself fails.
    """
    # Look the logger up here instead of relying on a global that only exists
    # when this file is executed as a script.
    log = logging.getLogger()

    # Write via an explicit path instead of changing the working directory.
    os.makedirs(result_path, exist_ok=True)
    img_name = '_' + str(ct) + '_' + str(num) + '.jpg'
    img_path = os.path.join(result_path, img_name)

    retries = Retry(total=10, backoff_factor=0.1, status_forcelist=[500])
    try:
        with Session() as s:
            # Apply the same retry policy to both schemes.
            s.mount('http://', HTTPAdapter(max_retries=retries))
            s.mount('https://', HTTPAdapter(max_retries=retries))
            img_obj = s.get(img_url, timeout=timeout)
    except Exception:
        _count_failure(access_fail_c)
        log.error("connect fail {}  ".format(img_url), exc_info=True)
        log.info("child_process {} exited... ".format(num))
        sys.exit(1)

    # Anything other than 200 (e.g. 404) counts as a failed download.
    if int(img_obj.status_code) != 200:
        _count_failure(access_fail_c)
        log.error("download {} fail staus = {}".format(img_url, img_obj.status_code))
    else:
        try:
            with open(img_path, 'wb') as f:
                f.write(img_obj.content)
        except OSError:
            _count_failure(access_fail_c)
            log.error("saved fail {} fail staus = {}".format(img_url, img_obj.status_code), exc_info=True)
        else:
            log.info("saved success {}  staus = {}".format(img_url, img_obj.status_code))
    log.info("child_process {} exited... ".format(num))
def start_process():
    """Download every image returned by ``get_images_url``.

    One child process running ``download_img`` is started per image, with at
    most ``cpu_count()`` children alive at any time. Images are written to a
    ``result_images`` directory next to this script (created if missing).
    After all children have finished, the number of failed downloads is logged.
    """
    # Use the root logger directly so this also works when the module is
    # imported rather than executed as a script.
    log = logging.getLogger()

    # Integer counter shared with the children; they increment it on failure.
    access_fail_c = multiprocessing.Value('i', 0)
    image_url, cts = get_images_url()

    result_path = os.path.dirname(os.path.abspath(__file__)) + "/result_images/"
    os.makedirs(result_path, exist_ok=True)

    max_workers = cpu_count()
    process_list = []
    for i, (url, ct) in enumerate(zip(image_url, cts)):
        # Wait for a free slot. The list is rebuilt rather than mutated while
        # being iterated, and the short sleep avoids spinning at 100% CPU.
        while len(process_list) >= max_workers:
            process_list = [p for p in process_list if p.is_alive()]
            if len(process_list) >= max_workers:
                time.sleep(0.05)

        Pro = Process(target=download_img, args=(url, i, result_path, ct, access_fail_c))
        log.info("child_process {} started... ".format(i))
        Pro.start()
        process_list.append(Pro)

    for p in process_list:
        p.join()
    log.info("all downloads finished, failed = %d", access_fail_c.value)
if __name__ == '__main__':
    # Script entry point: send all root-logger output (DEBUG and above) to
    # dl_images.log next to this script, then start the downloads.
    script_dir = os.path.dirname(os.path.abspath(__file__))

    # No size limit or backup count is passed to the handler.
    handler = RotatingFileHandler(script_dir + "/dl_images.log")
    handler.setFormatter(
        logging.Formatter('%(asctime)s - %(levelname)s - %(message)s')
    )

    # ``logger`` is a module-level name that the functions above refer to.
    logger = logging.getLogger()
    logger.addHandler(handler)
    logger.setLevel(logging.DEBUG)

    start_process()
复制代码

 

posted @   苦逼yw  阅读(6)  评论(0)  编辑  收藏  举报
相关博文:
阅读排行:
· 无需6万激活码!GitHub神秘组织3小时极速复刻Manus,手把手教你使用OpenManus搭建本
· C#/.NET/.NET Core优秀项目和框架2025年2月简报
· Manus爆火,是硬核还是营销?
· 终于写完轮子一部分:tcp代理 了,记录一下
· 【杭电多校比赛记录】2025“钉耙编程”中国大学生算法设计春季联赛(1)
点击右上角即可分享
微信分享提示