在 Python 中使用 PySpark 读取并整理日志数据,然后将数据写入 Elasticsearch(ES)。
代码如下
import re import datetime from pyspark.sql import SparkSession from pyspark import SparkContext from elasticsearch import Elasticsearch spark=SparkSession.builder.appName("lz").getOrCreate() sc = SparkContext.getOrCreate() es = Elasticsearch() month_map = {'Jan': '1', 'Feb': '2', 'Mar':'3', 'Apr':'4', 'May':'5', 'Jun':'6', 'Jul':'7', 'Aug':'8', 'Sep': '9', 'Oct':'10', 'Nov': '11', 'Dec': '12'} log_data = sc.textFile("/Desktop/data_doc/data_Log/sshlogin/03.txt") #使用spark读取本地日志文件 for b in log_data.toLocalIterator(): #以迭代的方式来把一条条数据读取出来进行正则匹配,并最终将 dict作为body写入到es中去 # e='Ambari:Mar 2 02:14:16 ambari sshd[16716]: Accepted password for root from 172.21.202.174 port 59886 ssh2'#日志格式 log_group=re.search('^(\S+):(\w{3})\s+(\d{1,2})\s(\d{2}:\d{2}:\d{2})\s(\S+)\s(\S+)\[(\d+)\]:\s(.+)',b) if log_group: year='2019' try: logtime = year+'-'+month_map[log_group.group(2)]+'-'+log_group.group(3)+' '+log_group.group(4) #将字段拼接成年月日的格式 logtime = datetime.datetime.strptime(logtime,'%Y-%m-%d %H:%M:%S') except Exception as e: pass row = dict(_hostname=log_group.group(1), #将数据组成一个字典 k,v syslog_timestamp=logtime, hostname=log_group.group(5), program=log_group.group(6), pid=log_group.group(7), msg = log_group.group(8)) if re.match('^Accepted password for',row['msg']) or re.match('^Accepted publickey for',row['msg']) : msg_a=re.search('Accepted\s\w+\sfor\s(\S+)\sfrom\s(\d{2,3}\.\d{2,3}\.\d{2,3}\.\d{2,3})\sport\s(\d+)',row['msg']) row['login_success']=True row['login_success_msg']={'username':msg_a.group(1),'user_ip':msg_a.group(2),'user_port':msg_a.group(3)} es.index(index='data_log02',doc_type='test02',body=row) #将数据写入到es中去 else: break
另外一种日志格式(UTM 防火墙日志)的处理代码如下:
import datetime from pyspark import SparkContext from elasticsearch import Elasticsearch sc = SparkContext.getOrCreate() log_data = sc.textFile("/Desktop/data_doc/data_Log/utm/GX04-UTM1000D-1") """ 一条日志的格式如下 Mar 1 00:00:08 172.21.208.21 date=2019-03-01 time=00:00:08 devname=GX04-UTM1000D-1 devid=FGT1KD3914800909 logid=0001000014 type=traffic subtype=local level=notice vd=root srcip=195.142.115.111 srcport=54045 srcintf="port12" dstip=114.242.119.194 dstport=80 dstintf="root" sessionid=1013402601 status=deny policyid=0 dstcountry="China" srccountry="Turkey" trandisp=noop service=FortiGuard proto=6 app="Web Management" duration=0 sentbyte=0 rcvdbyte=0 sentpkt=0 """ es = Elasticsearch() # 打印加载的用户信息第一条 fileds = log_data.map(lambda lines:lines.split()) #将数据按照空格来切割 print(fileds.first()) ''' 操作之后的数据格式 ['Mar', '1', '00:00:06', '172.21.208.21', 'date=2019-03-01', 'time=00:00:06', 'devname=GX04-UTM1000D-1', 'devid=FGT1KD3914800909', 'logid=0001000014', 'type=traffic', 'subtype=local', 'level=notice', 'vd=root', 'srcip=89.248.172.38', 'srcport=40462', 'srcintf="port12"', 'dstip=114.242.119.252', 'dstport=55325', 'dstintf="root"', 'sessionid=1013402572', 'status=deny', 'policyid=0', 'dstcountry="China"', 'srccountry="Netherlands"', 'trandisp=noop', 'service=55325/tcp', 'proto=6', 'app=55325/tcp', 'duration=0', 'sentbyte=0', 'rcvdbyte=0', 'sentpkt=0'] ''' start = datetime.datetime.now() #初级版本,用于处理utm的初级版本,直接读取健值对,然后将数据写入到es中去 for b in fileds.toLocalIterator(): d = {} for i in b[4:]: j=i.split("=") if len(j)==2: k,v=j[0].strip(),j[1].strip('"') if k =="date": v = datetime.datetime.strptime(v,"%Y-%m-%d") if k =="time": v = datetime.datetime.strptime(v,"%H:%M:%S") d[k]=v es.index(index='data_log01',doc_type='test01',body=d) #将整理好的 k,v格式的数据作为body写入es库 end = datetime.datetime.now() print(end-start,'这是时间')