Using Flume + HBase to Collect and Store Log Data
1. In this solution the data is stored in HBase, so we use the HBase sink that ships with Flume; to clean and transform the log data on the way in, we implement our own AsyncHbaseEventSerializer.
package com.ncc.dlut;

import java.io.UnsupportedEncodingException;
import java.util.ArrayList;
import java.util.List;

import org.apache.flume.Context;
import org.apache.flume.Event;
import org.apache.flume.conf.ComponentConfiguration;
import org.apache.flume.sink.hbase.AsyncHbaseEventSerializer;
import org.apache.flume.sink.hbase.SimpleRowKeyGenerator;
import org.hbase.async.AtomicIncrementRequest;
import org.hbase.async.PutRequest;

public class AsyncHbaseLTEEventSerializer implements AsyncHbaseEventSerializer {

    // Table name
    private byte[] table;
    // Column family
    private byte[] colFam;
    // Event currently being serialized
    private Event currentEvent;
    // Column names, read from the sink configuration
    private byte[][] columnNames;
    // Batched requests sent to HBase
    private final List<PutRequest> puts = new ArrayList<PutRequest>();
    private final List<AtomicIncrementRequest> incs = new ArrayList<AtomicIncrementRequest>();
    // Row key of the current event
    private byte[] currentRowKey;
    private final byte[] eventCountCol = "eventCount".getBytes();

    @Override
    public void configure(Context context) {
        // Read the comma-separated column names from the sink configuration
        String cols = context.getString("columns");
        String[] names = cols.split(",");
        columnNames = new byte[names.length][];
        int i = 0;
        for (String name : names) {
            columnNames[i++] = name.getBytes();
        }
    }

    @Override
    public void configure(ComponentConfiguration conf) {
        // Not used
    }

    @Override
    public void cleanUp() {
        table = null;
        colFam = null;
        currentEvent = null;
        columnNames = null;
        currentRowKey = null;
    }

    @Override
    public List<PutRequest> getActions() {
        // Split the event body into the individual column values
        String eventStr = new String(currentEvent.getBody());
        String[] cols = logTokenize(eventStr);
        puts.clear();

        // Left-pad the timestamp field to 13 digits so row keys sort correctly
        String time = cols[1];
        int n1 = 13 - time.length();
        StringBuilder sb = new StringBuilder(time);
        for (int i = 0; i < n1; i++) {
            sb.insert(0, '0');
        }

        try {
            // Use the row key generator shipped with the HBase sink:
            // "<cell id>-<padded timestamp>" plus a UUID suffix
            currentRowKey = SimpleRowKeyGenerator.getUUIDKey(cols[0] + "-" + sb.toString());
        } catch (UnsupportedEncodingException e) {
            e.printStackTrace();
        }

        // One PutRequest per column; guard against lines with extra fields
        int n = Math.min(cols.length, columnNames.length);
        for (int i = 0; i < n; i++) {
            PutRequest putReq = new PutRequest(table, currentRowKey, colFam,
                    columnNames[i], cols[i].getBytes());
            puts.add(putReq);
        }
        return puts;
    }

    @Override
    public List<AtomicIncrementRequest> getIncrements() {
        // Count every received event in the "totalEvents" row
        incs.clear();
        incs.add(new AtomicIncrementRequest(table, "totalEvents".getBytes(), colFam, eventCountCol));
        return incs;
    }

    // Receive the table name and column family from the sink
    @Override
    public void initialize(byte[] table, byte[] cf) {
        this.table = table;
        this.colFam = cf;
    }

    @Override
    public void setEvent(Event event) {
        this.currentEvent = event;
    }

    // Extract the column values from a "key:value,key:value,..." log line
    public String[] logTokenize(String eventStr) {
        String[] s = eventStr.split("[:,]");
        int n = s.length;
        String[] columns = new String[n / 2];
        for (int i = 0; 2 * i + 1 < n; i++) {
            columns[i] = s[2 * i + 1];
        }
        return columns;
    }
}
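The serializer assumes each log line is a comma-separated list of key:value pairs whose values line up with the configured column names (cid,time,pci,st,ed,ta,lng,lat, as set in the sink configuration below). A minimal standalone sketch of how logTokenize handles such a line (the sample line itself is a hypothetical illustration):

public class LogTokenizeDemo {
    public static void main(String[] args) {
        // Hypothetical log line matching the columns cid,time,pci,st,ed,ta,lng,lat
        String line = "cid:1001,time:1468551792000,pci:101,st:10,ed:20,ta:3,lng:121.50,lat:38.90";
        String[] s = line.split("[:,]");           // ["cid","1001","time","1468551792000",...]
        String[] columns = new String[s.length / 2];
        for (int i = 0; 2 * i + 1 < s.length; i++) {
            columns[i] = s[2 * i + 1];             // odd positions hold the values
        }
        for (String c : columns) {
            System.out.println(c);                 // prints 1001, 1468551792000, 101, ...
        }
    }
}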
The required jar packages, judging from the imports above, include at least flume-ng-core, flume-ng-sdk, flume-ng-configuration, flume-ng-hbase-sink, and the asynchbase client jar.
These jars can all be found in Flume's lib folder.
2. Package the program above into a jar and place it in Flume's lib folder.
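For example (a minimal sketch; FLUME_HOME and the jar name are assumptions for illustration):

# Compile against the jars in Flume's lib folder, then package and install
# (run from the source root containing com/ncc/dlut; jar name is hypothetical)
javac -cp "$FLUME_HOME/lib/*" com/ncc/dlut/AsyncHbaseLTEEventSerializer.java
jar cf flume-hbase-serializer.jar com/ncc/dlut/*.class
cp flume-hbase-serializer.jar $FLUME_HOME/lib/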
3. Configure Flume for collection and storage.
The configuration file flume-hbase.properties is as follows:
############################################
# flume-src-agent config
############################################

# agent section
agent.sources = s
agent.channels = c
agent.sinks = r

# source section
# agent.sources.s.type = exec
# agent.sources.s.command = tail -f -n+1 /usr/local/test.log
agent.sources.s.type = spooldir
agent.sources.s.spoolDir = /usr/local/flume-hbase
agent.sources.s.fileHeader = true
agent.sources.s.batchSize = 100
agent.sources.s.channels = c

# Each sink's type must be defined
agent.sinks.r.type = asynchbase
agent.sinks.r.table = car_table
agent.sinks.r.columnFamily = lte
agent.sinks.r.batchSize = 100
agent.sinks.r.serializer = com.ncc.dlut.AsyncHbaseLTEEventSerializer
agent.sinks.r.serializer.columns = cid,time,pci,st,ed,ta,lng,lat

# Specify the channel the sink should use
agent.sinks.r.channel = c

# Each channel's type is defined.
agent.channels.c.type = memory
agent.channels.c.capacity = 1000
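Before starting the agent, the target table and column family must already exist in HBase; the asynchbase sink does not create them. A minimal sketch of the remaining steps (run from FLUME_HOME; the config file path is an assumption):

# Create the table and column family referenced by the sink, in the HBase shell:
#   create 'car_table', 'lte'

# Start the agent defined above ("agent" must match the name used in the config)
bin/flume-ng agent --conf conf --conf-file conf/flume-hbase.properties \
    --name agent -Dflume.root.logger=INFO,console

Once the agent is running, any file dropped into /usr/local/flume-hbase is picked up by the spooldir source, parsed by the serializer, and written to car_table row by row.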