文件合并
#!/usr/bin/python #Filename :smelter_colmerge.py # -*- coding:utf-8 -*- import cx_Oracle import sys import getopt,string import re import os import datetime import os.path import configuration from Common import * class kk(object): def __init__(self): pass def conn_db(self): try: job_config=configuration.configuration() user_name=job_config.loadConfigFile('config_test.xml','user_name') user_password=job_config.loadConfigFile('config_test.xml','user_password') dns=job_config.loadConfigFile('config_test.xml','dns') db=cx_Oracle.connect(user_name,user_password,dns) return db except Exception as e: print (-1,e) def c_process_id(self,pid): db=self.conn_db() cursor=db.cursor() pro_pid=cursor.var(cx_Oracle.NUMBER) cursor.callproc("common.process_id",[pid,pro_pid]) return pro_pid.getvalue() db.close() def prhelp(self): print ''' Usage: smelter_colmerge.py [options] Options: -h, --help show this help message and exit -i, --interid input file_id with a comma segmentation,can more values input -n, --processid input process id -o, --targetdir input targetdir -b, --before assign to type code -t, --tableid assign to table id Example: python smelter_colmerge.py -n 100008111 -o /hadoop/pigtmp/yu -i 'I10173227','I10173228' or python smelter_colmerge.py -n 100008111 -o /hadoop/pigtmp/yu -b FILE_COLLECT -t 26 ''' #Parameters processing def fun_param_input(self): TYPCD=0 INTERID=0 try: if len(sys.argv) > 6: opts, args = getopt.getopt(sys.argv[1:],"i:n:o:b:t:h", ["interid=","processid=","targetdir=","before=","tableid=","help"]) #print(sys.argv) #print(opts) #print(args) for a,o in opts: if a in ('-i', '--interid'): INTERID=o #print INTERID elif a in ('-n','--processid'): PROCESSID=o #print PROCESSID elif a in ('-o','--targetdir'): targetdir=o #print targetdir elif a in ('-b','--before'): TYPCD=o #print o elif a in ('-t','--tableid'): table_id=o #print o elif a in ('-h', '--help'): self.prhelp() sys.exit() if os.path.exists(targetdir): if INTERID==0: #print PROCESSID,targetdir,TYPCD,0,table_id return PROCESSID,targetdir,TYPCD,0,table_id else: dd=INTERID.split(',') #print '###################' #print dd #print '###################' #dd=re.split(',',INTERID) #print PROCESSID,targetdir,0,dd,0 return PROCESSID,targetdir,0,dd,0 else: meg='%s not exists'%target_directory print (-1,meg) else: print 'Please input right parameters' self.prhelp() sys.exit() except Exception as e: return (-1,e) # find file def fun_find_file(self): PROCESSID,targetdir,bftype,f_list,table_id=self.fun_param_input() #print f_list process_id=self.c_process_id(PROCESSID) db=self.conn_db() cursor=db.cursor() dm_t=myexec_type() tp_id=dm_t.code_to_id('FILE_MERGE') if bftype==0: i=0 j=0 source_file=[] if len(f_list)<=1: meg='please input two or more' print (-1,meg) return 0,0,0,0,0,0 else: mg=[] while i< len(f_list): #print tp_id cursor.execute('select distinct if_file_name,table_id,file_id,file_type,current_directory,data_date,hour_id from log_file where file_id=:fl_id and status=\'S\'and type_id <>:tp_id',fl_id=f_list[i],tp_id=tp_id) a=cursor.fetchall() if len(a)==1: while j <= i: source_file.append(a[0]) j+=1 else: meg='%s is wrong parameter'%f_list[i] mg.append(meg) i+=1 if len(mg) > 1: print (-1,mg) if len(source_file)==len(f_list): #print '$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$' #print i,source_file,tp_id,targetdir,process_id,source_file[0][1] return i,source_file,tp_id,targetdir,process_id,source_file[0][1] else: return 0,0,0,0,0,0 else: m=0 source_file=[] bf_id =dm_t.code_to_id(bftype) #print bf_id cursor.execute('select max(process_id) from log_file where type_id=:tpp_id and table_id=:tbb_id and status=\'S\'',tpp_id=bf_id,tbb_id=table_id) c=cursor.fetchall() pcsid=c[0][0] cursor.execute('select distinct if_file_name, table_id,file_id,file_type,current_directory,data_date,hour_id from log_file where status=\'S\' and process_id=:pdd and table_id=:tbb_id and type_id <>:tp_id',pdd=pcsid,tbb_id=table_id,tp_id=tp_id) a=cursor.fetchall() if len(a)>=2: while m<len(a): source_file.append(a[m]) m+=1 #print "############################################################" #print m,source_file,tp_id,targetdir,process_id,table_id return m,source_file,tp_id,targetdir,process_id,table_id else: meg='File number less than two' print (-1,meg) return 0,0,0,0,0,0 db.close() # search if exit the merge file def fun_targetdir(self): f_num,source_file,tp_id,targetdir,process_id,table_id=self.fun_find_file() if f_num == 0: return 0,0,0,0,0,0 else: newname = 'merge'+source_file[0][0] targetname = os.path.join(targetdir,newname) if os.path.exists(targetname): meg='%s already exits'%targetname print (-1,meg) return 0,0,0,0,0,0 else: #print f_num,source_file,tp_id,targetdir,process_id,table_id return f_num,source_file,tp_id,targetdir,process_id,table_id # Judgment field is consistent def fun_colnum(self): try: f_num,source_file,tp_id,targetdir,process_id,table_id=self.fun_targetdir() db=self.conn_db() cursor=db.cursor() dm_t=myexec_type() vf_id=dm_t.code_to_id('IF_FILE_VERIFY_MASK') cursor.execute('select fields_terminated_by from dm_if_table where table_id=:tbl_id',tbl_id=table_id) d=cursor.fetchall() f_termin=d[0][0] f_termin=f_termin.replace('\'',"") f_termin=f_termin.replace("\"",'') cursor.execute('select max(position_end),max(column_seq) from (select position_start,column_seq,position_end from dm_if_column where table_id=:tbl_id order by 2)',tbl_id=table_id) c=cursor.fetchall() pos_end=c[0][0] column_seq=c[0][1] cursor.execute('select fields_terminated_by from dm_if_filemask where table_id=:tbl_id and type_id=:ty_id',tbl_id=table_id,ty_id=vf_id) d=cursor.fetchall() v_termin=d[0][0] v_termin=v_termin.replace('\'',"") v_termin=v_termin.replace("\"",'') #print "@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@" #print vf_id #print f_termin,pos_end,column_seq #print v_termin if f_num==0: pass else: w=0 i_num=0 v_num=0 i_file=[] v_file=[] while w < f_num: if source_file[w][2].startswith('I'): i_file.append(source_file[w]) i_num+=1 else: v_file.append(source_file[w]) v_num+=1 w+=1 z=0 meg=0 m=900 h=200 j=100 f_llth=[] v_llth=[] mgg=[] while z < f_num: s_name =os.path.join(source_file[z][4],source_file[z][0]) cursor.callproc("common.Log_file_initial",[process_id,source_file[z][1],source_file[z][2] ,tp_id,source_file[z][3]]) cursor.callproc("common.Log_file_start",[process_id,source_file[z][1],source_file[z][2],tp_id,source_file[z][3],source_file[z][5],source_file[z][6],source_file[z][0],source_file[z][4]]) try: ff=open(s_name,'r') except Exception as e: print (-1,'%s open error %s'%(s_name,e)) line=ff.readline() line=line.strip('\n') if source_file[z] in i_file: if pos_end > 1: #print "?????????????????????????????????" #print len(line) #print pos_end #print line if len(line)==pos_end: m+=1 else: meg='%s number of field not consistent' %source_file[z][2] mgg.append(meg) else: f_ll=line.split(f_termin) fl_num=len(f_ll) f_llth.append(fl_num) if max(f_llth)<>min(f_llth): print'9999999999999999999' cursor.callproc("common.Log_file_fail",[process_id,source_file[z][1],source_file[z][2],tp_id,source_file[z][3],meg,0,0]) meg='%s number of field not consistent' %source_file[z][2] mgg.append(meg) else: h+=1 #print h ff.close() if source_file[z] in v_file: v_ll=line.split(v_termin) vl_num=len(v_ll) v_llth.append(vl_num) if max(v_llth)<>min(v_llth): cursor.callproc("common.Log_file_fail",[process_id,source_file[z][1],source_file[z][2],tp_id,source_file[z][3],meg,0,0]) meg='The number of field not consistent' print (-1,meg) else: j+=1 #print j ff.close() z+=1 if len(mgg)>1: print (-1,mgg) if meg==0: #print f_num,i_file,v_file,source_file,tp_id,targetdir,process_id,table_id,v_termin return f_num,i_file,v_file,source_file,tp_id,targetdir,process_id,table_id,v_termin else: return 0,0,0,0,0,0,0,0 db.close() except Exception as e: return (-1,e) #merge files def fun_merge(self): try: db=self.conn_db() cursor=db.cursor() readsize = 1024 k=0 z=0 f_num,i_file,v_file,source_file,tp_id,targetdir,process_id,table_id,v_termin=self.fun_colnum() #print "$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$" #print f_num,i_file,v_file,source_file,tp_id,targetdir,process_id,table_id if f_num==0: pass else: if len(i_file)==0: pass else: newname1 = 'merge'+i_file[0][0] targetname1=os.path.join(targetdir,newname1) t_file1=open(targetname1,'wb') while z< len(i_file): if i_file[z] in i_file: f_name =os.path.join(i_file[z][4],i_file[z][0]) f_file=open(f_name,'rb') while 1: filebytes = f_file.read(readsize) if not filebytes: break else: t_file1.write(filebytes) t_file1.write(os.linesep) cursor.callproc("common.Log_file_success",[process_id,i_file[z][1],i_file[z][2],tp_id,i_file[z][3],0]) f_file.close() z+=1 t_file1.close if len(v_file)==0: pass else: newname2 = 'merge'+v_file[0][0] targetname2=os.path.join(targetdir,newname2) t_file2=open(targetname2,'wb') a=0 b=0 f_min=0 lst_f=[] file_line=[] cursor.execute('select column_seq from DM_IF_VERIFY_FILE where table_id=:tbl_id and if_filename=1',tbl_id=table_id) c=cursor.fetchall() column_seq=c[0][0] while k < len(v_file): v_name =os.path.join(v_file[k][4],v_file[k][0]) v_f=open(v_name,'rb') line=v_f.readline() line=line.strip('\n') ll=line.split(v_termin) file_line.append(ll) cursor.callproc("common.Log_file_success",[process_id,v_file[k][1],v_file[k][2],tp_id,v_file[k][3],0]) v_f.close() k+=1 #print '9999999999999999999999999999999999999999999' #print file_line while a<len(ll): b=0 f_min=0 while b<len(v_file): if a==column_seq-1: f_min=newname2 else: file_line[b][a]=int(file_line[b][a]) f_min=f_min+file_line[b][a] b=b+1 lst_f.append(f_min) a=a+1 #print "!!!!!!!!!!!!!!!!!!!!!!!!!!!!" #print lst_f #print "$$$$$$$$$$$$$$$$$$$$$$$$$$" q=0 while q<len(lst_f): lst_f[q]=str(lst_f[q]) q+=1 #print lst_f fstring="|".join(lst_f) #print fstring t_file2.write(fstring) t_file2.close() #print 'dddddddddddddddddddddddddddd' #print k #print z if k+z==f_num: print (0,f_num) db.close() except Exception as e: return (-1,e) if __name__ == '__main__': a=kk() a.fun_merge()