python分析日志脚本
#!/usr/bin/env python
# coding:utf-8
"""Summarise an access log per client IP.

Each log line is split on whitespace; field 0 is taken as the client
address, field 8 as the HTTP status code and field 9 as the number of
bytes sent.  (Presumably this matches the common/combined web-server log
layout -- verify against the logs you feed it.)

Usage: script.py filepath [number]
    filepath  log file to analyse
    number    optional; show only the first ``number`` hosts

The code runs unchanged on Python 2.6+/2.7 and Python 3: every ``print``
call takes exactly one argument, so it behaves the same as a statement
and as a function.
"""

import sys
import time


class DisplayFormat(object):
    """Console formatting helpers for the per-IP report."""

    # Row layout shared by the header, the separator, the host rows and the
    # totals row: IP, traffic, request count, request share, followed by
    # seven HTTP status-code columns.
    formatstring = '%-18s %-10s %-12s %8s %10s %10s %10s %10s %10s %10s %10s'

    def format_size(self, size):
        """Return the integer byte count ``size`` as a short string.

        The value is truncated (not rounded) to the largest binary unit
        that fits, e.g. ``1536 -> '1K'``; values below 1024 are returned
        with a ``B`` suffix.
        """
        for shift, suffix in ((40, 'T'), (30, 'G'), (20, 'M'), (10, 'K')):
            if size >= (1 << shift):
                return str(size >> shift) + suffix
        return str(size) + 'B'

    def echo_line(self):
        """Print the dashed separator row."""
        widths = (15, 10, 12, 12, 10, 10, 10, 10, 10, 10, 10)
        print(self.formatstring % tuple('-' * width for width in widths))

    def echo_head(self):
        """Print the column header row."""
        print(self.formatstring % ('IP', 'Traffic', 'Time', 'Time%',
                                   200, 404, 403, 503, 500, 302, 304))

    def echo_error(self):
        """Print the command-line usage message."""
        # The leading space keeps the script name and 'filepath' apart.
        print('Usage: ' + sys.argv[0] + ' filepath [number]')

    def echo_time(self):
        """Print the processor time consumed by the script so far."""
        # time.clock() was removed in Python 3.8; prefer process_time() and
        # fall back to clock() only on interpreters that lack it (Python 2).
        clock = getattr(time, 'process_time', None) or time.clock
        print('The script is running %s second' % clock())


class HostInfo(object):
    """Counters collected for a single client IP."""

    # Every counter kept per host: the tracked HTTP status codes, the
    # accumulated bytes sent ('size') and the request count ('time').
    host_info = ['200', '404', '403', '503', '500', '302', '304',
                 'size', 'time']

    def __init__(self, host):
        """Create zeroed counters.

        ``host`` is accepted for interface compatibility only; the
        counters are stored in ``self.host`` keyed by ``host_info``.
        """
        self.host = dict.fromkeys(self.host_info, 0)

    def add_1(self, status_size, is_size):
        """Update one counter.

        * ``status_size == 'time'``: increment the request count.
        * ``is_size`` true: add ``status_size`` (a byte count) to 'size'.
        * otherwise: increment the counter named by ``status_size``
          (raises ``KeyError`` if it is not a tracked counter).
        """
        if status_size == 'time':
            self.host['time'] += 1
        elif is_size:
            self.host['size'] += status_size
        else:
            self.host[status_size] += 1

    def get_value(self, value):
        """Return the counter stored under the key ``value``."""
        return self.host[value]


class AnalysisFile(object):
    """Aggregate log lines into per-host counters and grand totals."""

    def __init__(self):
        """Start with no hosts and all grand totals at zero."""
        # Maps client IP -> HostInfo (replaced by plain dicts once
        # return_sorted_list() has processed the mapping).
        self.empty = {}
        self.total_request_time = 0
        self.total_traffic = 0
        self.total_200 = 0
        self.total_404 = 0
        self.total_403 = 0
        self.total_503 = 0
        self.total_500 = 0
        self.total_302 = 0
        self.total_304 = 0

    def split_line_todict(self, line):
        """Split ``line`` on whitespace and return fields 0, 8 and 9.

        Returns a dict with the keys 'remote_host', 'status' and
        'bytes_sent'.  Raises ``IndexError`` when the line has fewer than
        ten fields.
        """
        fields = line.split()
        return {
            'remote_host': fields[0],
            'status': fields[8],
            'bytes_sent': fields[9],
        }

    def read_log(self, logs):
        """Consume an iterable of log lines and return the host mapping.

        Lines that cannot be split into the expected fields are skipped.
        Every remaining line counts as one request for its host; the
        status counter is bumped only for tracked status codes, and a
        non-numeric byte count (for example '-') is treated as 0.
        """
        for line in logs:
            try:
                parsed = self.split_line_todict(line)
            except (ValueError, IndexError):
                continue

            host = parsed['remote_host']
            status = parsed['status']

            host_info_obj = self.empty.get(host)
            if host_info_obj is None:
                host_info_obj = HostInfo(host)
                self.empty[host] = host_info_obj

            host_info_obj.add_1('time', False)

            if status in host_info_obj.host_info:
                host_info_obj.add_1(status, False)

            try:
                bytes_sent = int(parsed['bytes_sent'])
            except ValueError:
                bytes_sent = 0
            host_info_obj.add_1(bytes_sent, True)

        return self.empty

    def return_sorted_list(self, true_dict):
        """Accumulate grand totals and return the hosts ranked by usage.

        ``true_dict`` maps host -> HostInfo.  It is modified in place:
        each HostInfo value is replaced by a plain dict of its counters.
        The grand totals on ``self`` are increased as a side effect, so
        this is meant to be called once per mapping.

        Returns a list of ``(host, counters)`` pairs ordered by traffic,
        then by request count, both descending.
        """
        counter_keys = ('200', '404', '403', '503', '500', '302', '304',
                        'size', 'time')

        for host_key in true_dict:
            stats = true_dict[host_key]
            summary = {key: stats.get_value(key) for key in counter_keys}

            # Replace the HostInfo object with its plain counter dict.
            true_dict[host_key] = summary

            self.total_request_time += summary['time']
            self.total_traffic += summary['size']
            self.total_200 += summary['200']
            self.total_404 += summary['404']
            self.total_403 += summary['403']
            self.total_503 += summary['503']
            self.total_500 += summary['500']
            self.total_302 += summary['302']
            self.total_304 += summary['304']

        return sorted(true_dict.items(),
                      key=lambda item: (item[1]['size'], item[1]['time']),
                      reverse=True)


class Main(object):
    """Command-line entry point."""

    def main(self):
        """Parse ``sys.argv``, analyse the log file and print the report.

        On a usage error (wrong argument count, non-integer row limit,
        unreadable file) the problem and the usage text are printed and
        the method returns without producing a report.
        """
        displayformat = DisplayFormat()

        argc = len(sys.argv)
        if argc not in (2, 3):
            displayformat.echo_error()
            return

        log_file = sys.argv[1]

        # Validate the optional row limit before touching the file so that
        # a bad number can never leave an open file handle behind.
        try:
            lines = int(sys.argv[2]) if argc == 3 else 0
        except ValueError as e:
            print('')
            print(e)
            displayformat.echo_error()
            return

        fileanalysis = AnalysisFile()

        try:
            # The context manager closes the file even if reading fails.
            with open(log_file, 'r') as files:
                news_dict = fileanalysis.read_log(files)
        except IOError as e:
            print('')
            print(e)
            displayformat.echo_error()
            return

        new_list = fileanalysis.return_sorted_list(news_dict)

        # Number of distinct client IPs, counted before any truncation.
        total_ip = len(new_list)

        if lines:
            new_list = new_list[0:lines]

        # Summary line: IP count, total traffic, total request count.
        print('')
        total_request_time = fileanalysis.total_request_time
        total_traffic = displayformat.format_size(fileanalysis.total_traffic)
        print('总IP数量: %s 总的访问流量: %s 总的请求次数: %d' % (
            total_ip, total_traffic, total_request_time))

        # Table header and separator.
        print('')
        displayformat.echo_head()
        displayformat.echo_line()

        # One row per host, including its share of all requests.
        for host, counters in new_list:
            requests = counters['time']
            time_percentage = (
                float(requests) / float(fileanalysis.total_request_time)
            ) * 100
            print(displayformat.formatstring % (
                host,
                displayformat.format_size(counters['size']),
                requests, str(time_percentage)[0:5],
                counters['200'], counters['404'], counters['403'],
                counters['503'], counters['500'], counters['302'],
                counters['304']))

        # The totals row is only meaningful when every host was listed;
        # that includes a row limit larger than the number of hosts.
        if len(new_list) == total_ip:
            displayformat.echo_line()
            print(displayformat.formatstring % (
                total_ip, total_traffic, total_request_time, '100%',
                fileanalysis.total_200, fileanalysis.total_404,
                fileanalysis.total_403, fileanalysis.total_503,
                fileanalysis.total_500, fileanalysis.total_302,
                fileanalysis.total_304))

        # Execution time of the script.
        print('')
        displayformat.echo_time()


if __name__ == '__main__':
    main = Main()
    main.main()