Flink real-time project - day04 - 1. Case study: counting the users and events that click on or participate in an activity 2. Multi-dimensional activity metrics (custom Redis sink)
1. Case study
Sample data: user ID, activity ID, time, event type, province
u001,A1,2019-09-02 10:10:11,1,北京市
u001,A1,2019-09-02 14:10:11,1,北京市
u001,A1,2019-09-02 14:10:11,2,北京市
u002,A1,2019-09-02 14:10:11,1,北京市
u002,A2,2019-09-02 14:10:11,1,北京市
u002,A2,2019-09-02 15:10:11,1,北京市
u002,A2,2019-09-02 15:10:11,2,北京市
Event types:
0: impression (exposure)
1: click
2: participate
Requirement: for each activity, count the number of users (deduplicated) and the number of events for clicks and participation.
- Option 1: implement with ValueState holding a HashSet
The code is as follows:
ActivityCountAdv1
package cn._51doit.flink.day08;

import org.apache.flink.api.common.functions.MapFunction;
import org.apache.flink.api.common.state.ValueState;
import org.apache.flink.api.common.state.ValueStateDescriptor;
import org.apache.flink.api.common.typeinfo.TypeHint;
import org.apache.flink.api.common.typeinfo.TypeInformation;
import org.apache.flink.api.java.tuple.Tuple;
import org.apache.flink.api.java.tuple.Tuple4;
import org.apache.flink.api.java.tuple.Tuple5;
import org.apache.flink.configuration.Configuration;
import org.apache.flink.streaming.api.datastream.DataStreamSource;
import org.apache.flink.streaming.api.datastream.KeyedStream;
import org.apache.flink.streaming.api.datastream.SingleOutputStreamOperator;
import org.apache.flink.streaming.api.environment.LocalStreamEnvironment;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.streaming.api.functions.KeyedProcessFunction;
import org.apache.flink.util.Collector;

import java.util.HashSet;

public class ActivityCountAdv1 {

    public static void main(String[] args) throws Exception {
        LocalStreamEnvironment env = StreamExecutionEnvironment.createLocalEnvironment();
        DataStreamSource<String> lines = env.socketTextStream("feng05", 8888);

        // Split and clean the input lines
        SingleOutputStreamOperator<Tuple5<String, String, String, Integer, String>> tpDataStream = lines.map(
                new MapFunction<String, Tuple5<String, String, String, Integer, String>>() {
                    @Override
                    public Tuple5<String, String, String, Integer, String> map(String line) throws Exception {
                        String[] fields = line.split(",");
                        String uid = fields[0];
                        String activityID = fields[1];
                        String date = fields[2];
                        Integer type = Integer.parseInt(fields[3]);
                        String province = fields[4];
                        return Tuple5.of(uid, activityID, date, type, province);
                    }
                });

        // Key by activity ID and event type
        KeyedStream<Tuple5<String, String, String, Integer, String>, Tuple> keyed = tpDataStream.keyBy(1, 3);

        keyed.process(new KeyedProcessFunction<Tuple, Tuple5<String, String, String, Integer, String>, Tuple4<String, Integer, Integer, Integer>>() {

            // HashSet of deduplicated user IDs
            private transient ValueState<HashSet<String>> uidState;
            // Event count (not deduplicated)
            private transient ValueState<Integer> countState;

            @Override
            public void open(Configuration parameters) throws Exception {
                // State descriptor for the user-ID set
                ValueStateDescriptor<HashSet<String>> stateDescriptor1 = new ValueStateDescriptor<>(
                        "uid-state",
                        TypeInformation.of(new TypeHint<HashSet<String>>() {})
                );
                // State descriptor for the event count
                ValueStateDescriptor<Integer> stateDescriptor2 = new ValueStateDescriptor<>(
                        "count-state",
                        Integer.class
                );
                // Obtain the state handles
                uidState = getRuntimeContext().getState(stateDescriptor1);
                countState = getRuntimeContext().getState(stateDescriptor2);
            }

            @Override
            public void processElement(Tuple5<String, String, String, Integer, String> value, Context ctx,
                                       Collector<Tuple4<String, Integer, Integer, Integer>> out) throws Exception {
                String uid = value.f0;
                String aid = value.f1;
                Integer type = value.f3;

                // Deduplicate with the HashSet and update uidState
                HashSet<String> hashSet = uidState.value();
                if (hashSet == null) {
                    hashSet = new HashSet<>();
                }
                hashSet.add(uid);
                uidState.update(hashSet);

                // Count events (not deduplicated)
                Integer count = countState.value();
                if (count == null) {
                    count = 0;
                }
                count += 1;
                countState.update(count);

                out.collect(Tuple4.of(aid, type, hashSet.size(), count));
            }
        }).print();

        env.execute();
    }
}
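Fed the seven sample lines above in order, each (activity ID, event type) key emits a running (aid, type, distinct users, total events) tuple. Assuming the lines arrive in the order shown:

(A1,1,1,1)   u001's first click on A1
(A1,1,1,2)   u001 clicks A1 again: still 1 user, 2 clicks
(A1,2,1,1)   u001 participates in A1
(A1,1,2,3)   u002 clicks A1: 2 distinct users, 3 clicks
(A2,1,1,1)   u002's first click on A2
(A2,1,1,2)   u002 clicks A2 again
(A2,2,1,1)   u002 participates in A2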
If a HashSet is used for deduplication and the user base is large, the state consumes a great deal of memory and performance degrades, and the job can even run out of memory: storing, say, 100 million user IDs as strings in a per-key HashSet, with per-entry object overhead, easily runs to several gigabytes of state.
- Option 2 (improved): store user IDs in a BloomFilter. A BloomFilter can tell you that a user definitely has not been seen before, while using very little memory. However, a BloomFilter has no counter, so an extra piece of state is needed to store the deduplicated user count.
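A minimal sketch of the Guava BloomFilter API the job relies on (plain Guava coordinates here; the Flink job below uses Flink's shaded copy of the same classes):

import com.google.common.hash.BloomFilter;
import com.google.common.hash.Funnels;

public class BloomFilterSketch {
    public static void main(String[] args) {
        // Sized for ~10 million expected insertions; Guava's default
        // false-positive probability is about 3% at that capacity
        BloomFilter<CharSequence> filter =
                BloomFilter.create(Funnels.unencodedCharsFunnel(), 10_000_000);

        if (!filter.mightContain("u001")) { // definitely not seen yet
            filter.put("u001");             // remember this user
        }

        System.out.println(filter.mightContain("u001")); // true
        System.out.println(filter.mightContain("u999")); // almost certainly false
    }
}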
ActivityCountAdv2
package cn._51doit.flink.day08;

import org.apache.flink.api.common.functions.MapFunction;
import org.apache.flink.api.common.state.ValueState;
import org.apache.flink.api.common.state.ValueStateDescriptor;
import org.apache.flink.api.common.typeinfo.TypeHint;
import org.apache.flink.api.common.typeinfo.TypeInformation;
import org.apache.flink.api.java.tuple.Tuple;
import org.apache.flink.api.java.tuple.Tuple4;
import org.apache.flink.api.java.tuple.Tuple5;
import org.apache.flink.configuration.Configuration;
import org.apache.flink.shaded.guava18.com.google.common.hash.BloomFilter;
import org.apache.flink.shaded.guava18.com.google.common.hash.Funnels;
import org.apache.flink.streaming.api.datastream.DataStreamSource;
import org.apache.flink.streaming.api.datastream.KeyedStream;
import org.apache.flink.streaming.api.datastream.SingleOutputStreamOperator;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.streaming.api.functions.KeyedProcessFunction;
import org.apache.flink.util.Collector;

public class ActivityCountAdv2 {

    public static void main(String[] args) throws Exception {
        StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();

        // u001,A1,2019-09-02 10:10:11,1,北京市
        DataStreamSource<String> lines = env.socketTextStream("localhost", 8888);

        // Split and clean the input lines
        SingleOutputStreamOperator<Tuple5<String, String, String, String, String>> tpDataStream = lines.map(
                new MapFunction<String, Tuple5<String, String, String, String, String>>() {
                    @Override
                    public Tuple5<String, String, String, String, String> map(String line) throws Exception {
                        String[] fields = line.split(",");
                        String uid = fields[0];
                        String aid = fields[1];
                        String time = fields[2];
                        String type = fields[3];
                        String province = fields[4];
                        return Tuple5.of(uid, aid, time, type, province);
                    }
                });

        // Key by activity ID and event type
        KeyedStream<Tuple5<String, String, String, String, String>, Tuple> keyed = tpDataStream.keyBy(1, 3);

        keyed.process(new KeyedProcessFunction<Tuple, Tuple5<String, String, String, String, String>, Tuple4<String, String, Integer, Integer>>() {

            // BloomFilter remembering the user IDs seen so far
            private transient ValueState<BloomFilter> uidState;
            // Deduplicated user count (the BloomFilter itself has no counter)
            private transient ValueState<Integer> uidCountState;
            // Event count (not deduplicated)
            private transient ValueState<Integer> countState;

            @Override
            public void open(Configuration parameters) throws Exception {
                // State descriptor for the BloomFilter
                ValueStateDescriptor<BloomFilter> stateDescriptor1 = new ValueStateDescriptor<>(
                        "uid-state",
                        TypeInformation.of(new TypeHint<BloomFilter>() {})
                );
                // State descriptor for the event count
                ValueStateDescriptor<Integer> stateDescriptor2 = new ValueStateDescriptor<>(
                        "count-state",
                        Integer.class
                );
                // State descriptor for the deduplicated user count
                ValueStateDescriptor<Integer> stateDescriptor3 = new ValueStateDescriptor<>(
                        "uid-count-state",
                        Integer.class
                );
                // Obtain the state handles
                uidState = getRuntimeContext().getState(stateDescriptor1);
                countState = getRuntimeContext().getState(stateDescriptor2);
                uidCountState = getRuntimeContext().getState(stateDescriptor3);
            }

            @Override
            public void processElement(Tuple5<String, String, String, String, String> value, Context ctx,
                                       Collector<Tuple4<String, String, Integer, Integer>> out) throws Exception {
                String uid = value.f0;
                String aid = value.f1;
                String type = value.f3;

                // Deduplicate with the BloomFilter
                BloomFilter bloomFilter = uidState.value();
                Integer uidCount = uidCountState.value(); // distinct users
                Integer count = countState.value();       // events
                if (count == null) {
                    count = 0;
                }
                if (bloomFilter == null) {
                    bloomFilter = BloomFilter.create(Funnels.unencodedCharsFunnel(), 10000000);
                    uidCount = 0;
                }
                if (!bloomFilter.mightContain(uid)) {
                    bloomFilter.put(uid); // remember this user
                    uidCount += 1;
                }
                count += 1;

                countState.update(count);
                uidState.update(bloomFilter);
                uidCountState.update(uidCount);

                out.collect(Tuple4.of(aid, type, uidCount, count));
            }
        }).print();

        env.execute();
    }
}
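One caveat with this approach: mightContain can return a false positive (report an ID that was never added) but never a false negative, so uidCount can only undercount slightly; the error is bounded by the false-positive probability chosen when the filter is created (about 3% with Guava's defaults at the expected number of insertions). Also note that, unlike the HashSet, a BloomFilter cannot enumerate or remove its elements, which is why the separate uidCountState is required.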
2. Multi-dimensional activity metrics
This approach needs a separate keyBy for each dimension combination, which is rather tedious. Since the results are written into Redis, no user-defined Flink state is needed here; see the code below.
ActivityCountWithMultiDimension
package cn._51doit.flink.day08;

import org.apache.flink.api.common.functions.MapFunction;
import org.apache.flink.api.java.tuple.Tuple3;
import org.apache.flink.api.java.utils.ParameterTool;
import org.apache.flink.streaming.api.datastream.DataStreamSource;
import org.apache.flink.streaming.api.datastream.SingleOutputStreamOperator;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;

public class ActivityCountWithMultiDimension {

    public static void main(String[] args) throws Exception {
        // Read the Redis connection settings from a properties file passed as the first argument
        ParameterTool parameters = ParameterTool.fromPropertiesFile(args[0]);

        StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
        // Make the parameters visible to all operators (MyRedisSink reads them in open())
        env.getConfig().setGlobalJobParameters(parameters);

        // u001,A1,2019-09-02 10:10:11,1,北京市
        DataStreamSource<String> lines = env.socketTextStream("localhost", 8888);

        SingleOutputStreamOperator<ActivityBean> beanStream = lines.map(new MapFunction<String, ActivityBean>() {
            @Override
            public ActivityBean map(String line) throws Exception {
                String[] fields = line.split(",");
                String uid = fields[0];
                String aid = fields[1];
                String date = fields[2].split(" ")[0];
                String type = fields[3];
                String province = fields[4];
                return ActivityBean.of(uid, aid, date, type, province);
            }
        });

        // One keyBy per dimension combination
        SingleOutputStreamOperator<ActivityBean> res1 = beanStream.keyBy("aid", "type").sum("count");
        SingleOutputStreamOperator<ActivityBean> res2 = beanStream.keyBy("aid", "type", "date").sum("count");
        SingleOutputStreamOperator<ActivityBean> res3 = beanStream.keyBy("aid", "type", "date", "province").sum("count");

        res1.map(new MapFunction<ActivityBean, Tuple3<String, String, String>>() {
            @Override
            public Tuple3<String, String, String> map(ActivityBean value) throws Exception {
                return Tuple3.of(Constant.ACTIVITY_COUNT + "-" + value.aid, value.type, value.count.toString());
            }
        }).addSink(new MyRedisSink());

        res2.map(new MapFunction<ActivityBean, Tuple3<String, String, String>>() {
            @Override
            public Tuple3<String, String, String> map(ActivityBean value) throws Exception {
                return Tuple3.of(Constant.DAILY_ACTIVITY_COUNT + "-" + value.aid + "-" + value.date,
                        value.type, value.count.toString());
            }
        }).addSink(new MyRedisSink());

        res3.map(new MapFunction<ActivityBean, Tuple3<String, String, String>>() {
            @Override
            public Tuple3<String, String, String> map(ActivityBean value) throws Exception {
                return Tuple3.of(Constant.PROVINCE_DAILY_ACTIVITY_COUNT + "-" + value.aid + "-" + value.date + "-" + value.province,
                        value.type, value.count.toString());
            }
        }).addSink(new MyRedisSink());

        env.execute();
    }
}
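The job expects the path of a properties file as its first argument. A minimal example with the keys MyRedisSink reads (host and password values below are placeholders; redis.port and redis.db fall back to 6379 and 0 if omitted):

# config.properties (placeholder values)
redis.host=localhost
redis.password=your-password
redis.port=6379
redis.db=0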
Constant
package cn._51doit.flink.day08;

public class Constant {

    public static final String ACTIVITY_COUNT = "ACTIVITY_COUNT";

    public static final String DAILY_ACTIVITY_COUNT = "DAILY_ACTIVITY_COUNT";

    public static final String PROVINCE_DAILY_ACTIVITY_COUNT = "PROVINCE_DAILY_ACTIVITY_COUNT";
}
ActivityBean
package cn._51doit.flink.day08;

public class ActivityBean {

    public String uid;
    public String aid;
    public String date;
    public String type;
    public String province;
    // Per-event increment, summed by keyBy(...).sum("count")
    public Long count = 1L;

    public ActivityBean() {}

    public ActivityBean(String uid, String aid, String date, String type, String province) {
        this.uid = uid;
        this.aid = aid;
        this.date = date;
        this.type = type;
        this.province = province;
    }

    public static ActivityBean of(String uid, String aid, String date, String type, String province) {
        return new ActivityBean(uid, aid, date, type, province);
    }

    @Override
    public String toString() {
        return "ActivityBean{" +
                "uid='" + uid + '\'' +
                ", aid='" + aid + '\'' +
                ", date='" + date + '\'' +
                ", type='" + type + '\'' +
                ", province='" + province + '\'' +
                '}';
    }
}
MyRedisSink
package cn._51doit.flink.day08;

import org.apache.flink.api.java.tuple.Tuple3;
import org.apache.flink.api.java.utils.ParameterTool;
import org.apache.flink.configuration.Configuration;
import org.apache.flink.streaming.api.functions.sink.RichSinkFunction;
import redis.clients.jedis.Jedis;

public class MyRedisSink extends RichSinkFunction<Tuple3<String, String, String>> {

    private transient Jedis jedis;

    @Override
    public void open(Configuration parameters) throws Exception {
        // Read the connection settings registered via setGlobalJobParameters
        ParameterTool params = (ParameterTool) getRuntimeContext()
                .getExecutionConfig()
                .getGlobalJobParameters();
        String host = params.getRequired("redis.host");
        String password = params.getRequired("redis.password");
        int port = params.getInt("redis.port", 6379);
        int db = params.getInt("redis.db", 0);

        Jedis jedis = new Jedis(host, port);
        jedis.auth(password);
        jedis.select(db);
        this.jedis = jedis;
    }

    @Override
    public void invoke(Tuple3<String, String, String> value, Context context) throws Exception {
        // Reconnect if the connection was dropped
        if (!jedis.isConnected()) {
            jedis.connect();
        }
        // f0: Redis key, f1: hash field (event type), f2: running count
        jedis.hset(value.f0, value.f1, value.f2);
    }

    @Override
    public void close() throws Exception {
        jedis.close();
    }
}
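Because sum emits an updated running total for every input record and hset simply overwrites the field, each hash ends up holding the latest totals. Fed the sample lines from section 1, Redis would contain hashes like the following (fields are event types, values are counts; 北京市 comes straight from the data):

HGETALL ACTIVITY_COUNT-A1
1 -> 3   # three clicks on A1
2 -> 1   # one participation in A1

HGETALL DAILY_ACTIVITY_COUNT-A1-2019-09-02
1 -> 3
2 -> 1

HGETALL PROVINCE_DAILY_ACTIVITY_COUNT-A1-2019-09-02-北京市
1 -> 3
2 -> 1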