Python multiprocessing: working around the process-pool exit hang
When a Python script runs work across multiple processes, it can randomly finish all of its tasks yet fail to exit on its own. The larger the process count and the more data being processed, the more likely this exit anomaly becomes. The script further below works around the problem.
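For context, here is a minimal sketch (not part of the original script) of the conventional close()/join() shutdown in which the hang shows up: with a large enough pool and enough tasks, the final join() occasionally never returns even though every worker has finished its task.

from multiprocessing import Pool

def work(n):
    # trivial stand-in for the real per-task work
    return n * n

if __name__ == '__main__':
    p = Pool(8)
    results = [p.apply_async(work, args=(i,)) for i in range(10000)]
    p.close()
    p.join()    # on some runs this call never returns, even after all tasks finish
    print(sum(r.get() for r in results))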
The full workaround script:

import argparse
import datetime
import getpass
import time
from multiprocessing import Pool

import requests
from requests.packages.urllib3.exceptions import InsecureRequestWarning

# suppress the warning triggered by verify=False
requests.packages.urllib3.disable_warnings(InsecureRequestWarning)

def get_parameter():
    parser = argparse.ArgumentParser(description='Batch-request URLs with the requests library (v3)')
    parser.add_argument('-f', dest='inputFile', type=str, required=True, help='file containing the URLs')
    parser.add_argument('-o', dest='outputFile', type=str, default='result.txt', help='file to save the results to')
    parser.add_argument('-p', dest='proxyServer', type=str, default='', help='proxy server; none by default')
    parser.add_argument('-n', dest='processNum', type=int, default=1, help='number of worker processes, default 1')
    args = parser.parse_args()
    return args.inputFile, args.outputFile, args.proxyServer, args.processNum

def set_proxy(urlProxy):
    if not urlProxy:
        return {}
    username = input('username:')
    password = getpass.getpass('password:')
    http_proxy = 'http://' + username + ':' + password + '@' + urlProxy
    https_proxy = 'https://' + username + ':' + password + '@' + urlProxy
    return {'http': http_proxy, 'https': https_proxy}

def get_url(urlFile):
    with open(urlFile, 'r') as f:
        return f.readlines()

def http_request(url, proxy=None):
    headers = {
        'User-Agent': 'curl/3.03',
        'Connection': 'close',   # avoid keep-alive sockets piling up in the workers
    }
    try:
        r = requests.get(url, headers=headers, proxies=proxy, timeout=15, verify=False)
        urlresult = url + '\t' + str(r.status_code)
    except Exception as e:
        urlresult = url + '\t' + str(e)
    print(urlresult)
    return urlresult

def main():
    start_time = datetime.datetime.now()
    inputFile, outputFile, proxyServer, processNum = get_parameter()
    allUrl = get_url(inputFile)
    proxies = set_proxy(proxyServer)
    p = Pool(processNum)
    print('Total URLs: {}'.format(len(allUrl)))

    def writer_log(urlresult):
        # the callback runs in the parent process, so a plain append is safe
        with open(outputFile, 'a+') as wf:
            wf.write(urlresult + '\n')

    # keep only the AsyncResult of the last submitted task; it serves as a
    # rough "the batch is almost done" signal below
    for line in allUrl:
        url = line.split()[-1]
        result = p.apply_async(http_request, args=(url, proxies), callback=writer_log)
    p.close()

    # The workaround: instead of p.join(), which occasionally hangs forever,
    # poll the last task and, once it is ready, wait out a few extra grace
    # periods so in-flight tasks can finish, then force the pool down.
    count = 0
    while True:
        try:
            time.sleep(60)
            if result.ready():
                count += 1
                time.sleep(180)
                result.get()
            if count > 4:
                break
        except Exception as e:
            print('Process exception: {}'.format(str(e)))
    p.terminate()   # force-stop the workers; this is what guarantees the exit
    p.join()

    end_time = datetime.datetime.now()
    print('Start time: {}'.format(start_time))
    print('End time: {}'.format(end_time))
    print('Elapsed: {}'.format(end_time - start_time))
    print('Results saved to: {}'.format(outputFile))

if __name__ == '__main__':
    main()
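Assuming the script is saved as url_batch.py (the file name is not given in the original) and urls.txt contains one URL per line, a typical run with 20 worker processes looks like:

python url_batch.py -f urls.txt -o result.txt -n 20

Progress is printed per URL, and each url/status line is appended to result.txt as its task completes.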
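A variant of the same idea, in case polling a single AsyncResult feels too coarse: keep every AsyncResult, fetch each one with a bounded timeout, then terminate() the pool. A minimal self-contained sketch (the worker function and the timeout value are placeholders, not from the original script):

from multiprocessing import Pool, TimeoutError

def work(n):
    return n * n

if __name__ == '__main__':
    p = Pool(8)
    results = [p.apply_async(work, args=(i,)) for i in range(1000)]
    p.close()
    for r in results:
        try:
            r.get(timeout=60)   # bounded wait per task; never blocks forever
        except TimeoutError:
            print('a task timed out; terminate() will reap its worker')
    p.terminate()               # force-stop any stragglers rather than join()-ing forever
    p.join()

Because every get() is bounded, the collection loop is guaranteed to finish, and terminate() then tears the pool down regardless of what state the workers are in.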