根据PV统计出前三的热门板块,并统计出热门板块下的用户数--方式二
测试数据
java代码
1 package com.hzf.spark.study; 2 3 import java.util.ArrayList; 4 import java.util.Collections; 5 import java.util.Comparator; 6 import java.util.HashMap; 7 import java.util.Iterator; 8 import java.util.List; 9 import java.util.Map; 10 import java.util.Set; 11 12 import org.apache.spark.SparkConf; 13 import org.apache.spark.api.java.JavaPairRDD; 14 import org.apache.spark.api.java.JavaRDD; 15 import org.apache.spark.api.java.JavaSparkContext; 16 import org.apache.spark.api.java.function.Function; 17 import org.apache.spark.api.java.function.PairFlatMapFunction; 18 import org.apache.spark.api.java.function.PairFunction; 19 import org.apache.spark.api.java.function.VoidFunction; 20 import org.apache.spark.broadcast.Broadcast; 21 22 import scala.Tuple2; 23 24 public class HotChannel02 { 25 public static void main(String[] args) { 26 SparkConf conf = new SparkConf() 27 .setAppName("HotChannel") 28 .setMaster("local") 29 .set("spark.testing.memory", "2147480000"); 30 JavaSparkContext sc = new JavaSparkContext(conf); 31 JavaRDD<String> logRDD = sc.textFile("f:/userLog"); 32 String str = "View"; 33 final Broadcast<String> broadcast = sc.broadcast(str); 34 hotChannel(sc, logRDD, broadcast); 35 } 36 private static void hotChannel(JavaSparkContext sc, JavaRDD<String> logRDD, final Broadcast<String> broadcast) { 37 JavaRDD<String> filteredLogRDD = logRDD.filter(new Function<String, Boolean>() { 38 39 private static final long serialVersionUID = 1L; 40 41 @Override 42 public Boolean call(String v1) throws Exception { 43 String actionParam = broadcast.value(); 44 String action = v1.split("\t")[5]; 45 return actionParam.equals(action); 46 } 47 }); 48 49 JavaPairRDD<String, String> channel2nullRDD = filteredLogRDD.mapToPair(new PairFunction<String, String,String>() { 50 51 private static final long serialVersionUID = 1L; 52 53 @Override 54 public Tuple2<String, String> call(String val) throws Exception { 55 String channel = val.split("\t")[4]; 56 57 return new Tuple2<String, String>(channel,null); 58 } 59 }); 60 Map<String, Object> channelPVMap = channel2nullRDD.countByKey(); 61 Set<String> keySet = channelPVMap.keySet(); 62 List<SortObj> channels = new ArrayList<>(); 63 for(String channel : keySet){ 64 channels.add(new SortObj(channel, Integer.valueOf(channelPVMap.get(channel)+""))); 65 } 66 Collections.sort(channels, new Comparator<SortObj>() { 67 68 @Override 69 public int compare(SortObj o1, SortObj o2) { 70 return o2.getValue() - o1.getValue(); 71 } 72 }); 73 74 List<String> hotChannelList = new ArrayList<>(); 75 for (int i = 0; i < 3; i++) { 76 hotChannelList.add(channels.get(i).getKey()); 77 } 78 79 80 final Broadcast<List<String>> hotChannelListBroadcast = sc.broadcast(hotChannelList); 81 82 83 JavaRDD<String> filtedRDD = logRDD.filter(new Function<String, Boolean>() { 84 85 @Override 86 public Boolean call(String v1) throws Exception { 87 List<String> hostChannels = hotChannelListBroadcast.value(); 88 String channel = v1.split("\t")[4]; 89 String userId = v1.split("\t")[2]; 90 return hostChannels.contains(channel) && !"null".equals(userId); 91 } 92 }); 93 94 JavaPairRDD<String, String> user2ChannelRDD = filtedRDD.mapToPair(new PairFunction<String, String,String>() { 95 96 private static final long serialVersionUID = 1L; 97 98 @Override 99 public Tuple2<String, String> call(String val) throws Exception { 100 String[] splited = val.split("\t"); 101 String userId = splited[2]; 102 String channel = splited[4]; 103 return new Tuple2<String, String>(userId,channel); 104 } 105 }); 106 107 JavaPairRDD<String, String> userVistChannelsRDD = user2ChannelRDD.groupByKey().flatMapToPair(new PairFlatMapFunction<Tuple2<String,Iterable<String>>, String, String>() { 108 109 private static final long serialVersionUID = 1L; 110 111 @Override 112 public Iterable<Tuple2<String, String>> call(Tuple2<String, Iterable<String>> tuple) throws Exception { 113 String userId = tuple._1; 114 Iterator<String> iterator = tuple._2.iterator(); 115 Map<String, Integer> channelMap = new HashMap<>(); 116 while (iterator.hasNext()) { 117 String channel = iterator.next(); 118 Integer count = channelMap.get(channel); 119 if(count == null) 120 count = 1; 121 else 122 count++; 123 channelMap.put(channel, count); 124 } 125 126 List<Tuple2<String, String>> list = new ArrayList<>(); 127 Set<String> keys = channelMap.keySet(); 128 for(String channel : keys){ 129 Integer channelNum = channelMap.get(channel); 130 list.add(new Tuple2<String, String>(channel, userId + "_" + channelNum)); 131 } 132 return list; 133 } 134 }); 135 136 137 userVistChannelsRDD.groupByKey().foreach(new VoidFunction<Tuple2<String,Iterable<String>>>() { 138 139 private static final long serialVersionUID = 1L; 140 141 @Override 142 public void call(Tuple2<String, Iterable<String>> tuple) throws Exception { 143 String channel = tuple._1; 144 Iterator<String> iterator = tuple._2.iterator(); 145 List<SortObj> list = new ArrayList<>(); 146 while (iterator.hasNext()) { 147 String ucs = iterator.next(); 148 String[] splited = ucs.split("_"); 149 String userId = splited[0]; 150 Integer num = Integer.valueOf(splited[1]); 151 list.add(new SortObj(userId, num)); 152 } 153 154 Collections.sort(list,new Comparator<SortObj>() { 155 156 @Override 157 public int compare(SortObj o1, SortObj o2) { 158 return o2.getValue() - o1.getValue(); 159 } 160 }); 161 162 System.out.println("HOT_CHANNLE:"+channel); 163 for(int i = 0 ; i < 3 ; i++){ 164 SortObj sortObj = list.get(i); 165 System.out.println(sortObj.getKey() + "===" + sortObj.getValue()); 166 } 167 } 168 }); 169 } 170 }
result