用Spark完成复杂TopN计算的两种逻辑
如果有商品品类的数据pairRDD(categoryId,clickCount_orderCount_payCount),用Spark完成Top5,你会怎么做?
这里假设使用Java语言进行编写,那么你有两种思路:
1.简化成RDD(categoryObject),其中categoryObject实现了java.lang.Comparable.然后使用top(5)获得topN
2.转换成PairRDD(categoryKey,info),其中categoryKey实现了scala.math.Ordered。然后进行sortByKey之后再take(5).
注意:
1)在Java的Spark API中,top(n)函数内部使用java.lang.Comparable进行比较。
2)而sortByKey函数在Java的Spark API中依然调用scala.math.Ordered进行比较.
相比之下,思路2在空间和时间开销上都不如思路1;但如果我们本来就需要完整的排序结果,并想顺便取得topN,则使用思路2更合适。
思路1实现:
CategoryObject:
package com.stan.core.spark.userAction; import java.io.Serializable; public class ComparableCategoryObject implements Comparable<ComparableCategoryObject>, Serializable { String categoryId; Long clickCategoryCount; Long orderCategoryCount; Long defrayCategoryCount; @Override public int compareTo(ComparableCategoryObject o) { long compareNum = (this.defrayCategoryCount - o.defrayCategoryCount) * 10000 + (this.orderCategoryCount - o.orderCategoryCount) * 100 + (this.clickCategoryCount - o.clickCategoryCount) * 1; return (int)(compareNum%1000); } @Override public String toString() { return "ComparableCategoryObject{" + "categoryId='" + categoryId + '\'' + ", clickCategoryCount=" + clickCategoryCount + ", orderCategoryCount=" + orderCategoryCount + ", defrayCategoryCount=" + defrayCategoryCount + '}'; } public String getCategoryId() { return categoryId; } public void setCategoryId(String categoryId) { this.categoryId = categoryId; } public Long getClickCategoryCount() { return clickCategoryCount; } public void setClickCategoryCount(Long clickCategoryCount) { this.clickCategoryCount = clickCategoryCount; } public Long getOrderCategoryCount() { return orderCategoryCount; } public void setOrderCategoryCount(Long orderCategoryCount) { this.orderCategoryCount = orderCategoryCount; } public Long getDefrayCategoryCount() { return defrayCategoryCount; } public void setDefrayCategoryCount(Long defrayCategoryCount) { this.defrayCategoryCount = defrayCategoryCount; } }
具体调用方法:
// 1.封装 JavaRDD<ComparableCategoryObject> comparableCategoryObjectJavaRDD = categoryId2allCount.map( new Function<Tuple2<String, String>, ComparableCategoryObject>() { @Override public ComparableCategoryObject call(Tuple2<String, String> stringStringTuple2) throws Exception { String categoryId = stringStringTuple2._1; String allCount = stringStringTuple2._2; String[] tmpAllCountSplited = allCount.split("_"); Long clickCount = Long.valueOf(tmpAllCountSplited[0]); Long orderCount = Long.valueOf(tmpAllCountSplited[1]); Long defrayCount = Long.valueOf(tmpAllCountSplited[2]); ComparableCategoryObject comparableCategoryObject = new ComparableCategoryObject(); comparableCategoryObject.setCategoryId(categoryId); comparableCategoryObject.setClickCategoryCount(clickCount); comparableCategoryObject.setOrderCategoryCount(orderCount); comparableCategoryObject.setDefrayCategoryCount(defrayCount); return comparableCategoryObject; } } ); // 2.top(5) List<ComparableCategoryObject> top10Categorys = comparableCategoryObjectJavaRDD.top(5);
思路2实现:
CategoryKey:
package com.stan.core.spark.userAction; import scala.Serializable; import scala.math.Ordered; /** * 用于按照 * (clickCategoryCount,orderCategoryCount,defrayCategoryCount)的优先级排序 */ public class ComparableCategoryKey // scala中可比较,以便于进行RDD排序 implements Ordered<ComparableCategoryKey>, Serializable { String categoryId; Long clickCategoryCount; Long orderCategoryCount; Long defrayCategoryCount; /** * 计算比较数 * * 因为优先级为 : 先比较支付量,若支付量相同,则比较下单量,若下单量相同,则继续比较点击量 * 所以我在进行比较的时候直接使用 比较值 = 支付量差 * 10000 + 下单量差 * 100 + 点击量差 * 若比较值小于 0 ,则小于,若比较值大于0 , 则大于,若比较值等于0,则等于 * @param comparableCategoryWithAllCount * @return */ public long computeCompareNum(ComparableCategoryKey comparableCategoryWithAllCount){ long compareNum = (this.defrayCategoryCount - comparableCategoryWithAllCount.defrayCategoryCount) * 10000 + (this.orderCategoryCount - comparableCategoryWithAllCount.orderCategoryCount) * 100 + (this.clickCategoryCount - comparableCategoryWithAllCount.clickCategoryCount) * 1; return compareNum; } @Override public int compare(ComparableCategoryKey comparableCategoryWithAllCount) { return (int)(computeCompareNum(comparableCategoryWithAllCount)%1000); } @Override public boolean $less(ComparableCategoryKey comparableCategoryWithAllCount) { return computeCompareNum(comparableCategoryWithAllCount) < 0; } @Override public boolean $greater(ComparableCategoryKey comparableCategoryWithAllCount) { return computeCompareNum(comparableCategoryWithAllCount) > 0; } @Override public boolean $less$eq(ComparableCategoryKey comparableCategoryWithAllCount) { return computeCompareNum(comparableCategoryWithAllCount) <= 0; } @Override public boolean $greater$eq(ComparableCategoryKey comparableCategoryWithAllCount) { return computeCompareNum(comparableCategoryWithAllCount) >= 0; } @Override public int compareTo(ComparableCategoryKey comparableCategoryWithAllCount) { return (int)(computeCompareNum(comparableCategoryWithAllCount)%1000); } public String getCategoryId() { return categoryId; } public void 
setCategoryId(String categoryId) { this.categoryId = categoryId; } public Long getClickCategoryCount() { return clickCategoryCount; } public void setClickCategoryCount(Long clickCategoryCount) { this.clickCategoryCount = clickCategoryCount; } public Long getOrderCategoryCount() { return orderCategoryCount; } public void setOrderCategoryCount(Long orderCategoryCount) { this.orderCategoryCount = orderCategoryCount; } public Long getDefrayCategoryCount() { return defrayCategoryCount; } public void setDefrayCategoryCount(Long defrayCategoryCount) { this.defrayCategoryCount = defrayCategoryCount; } @Override public String toString() { return "ComparableCategoryKey{" + "categoryId='" + categoryId + '\'' + ", clickCategoryCount=" + clickCategoryCount + ", orderCategoryCount=" + orderCategoryCount + ", defrayCategoryCount=" + defrayCategoryCount + '}'; } }
具体的调用过程:
// 1.封装成(categoryKey,info) JavaPairRDD<ComparableCategoryKey,String> comparableCategory2AllCountRDD = categoryId2allCount.mapToPair( new PairFunction<Tuple2<String, String>, ComparableCategoryKey,String>() { @Override public Tuple2<ComparableCategoryKey,String> call(Tuple2<String, String> stringStringTuple2) throws Exception { String categoryId = stringStringTuple2._1; String allCount = stringStringTuple2._2; String[] tmpAllCountSplited = allCount.split("_"); Long clickCount = Long.valueOf(tmpAllCountSplited[0]); Long orderCount = Long.valueOf(tmpAllCountSplited[1]); Long defrayCount = Long.valueOf(tmpAllCountSplited[2]); ComparableCategoryKey comparableCategoryWithAllCount = new ComparableCategoryKey(); comparableCategoryWithAllCount.setCategoryId(categoryId); comparableCategoryWithAllCount.setClickCategoryCount(clickCount); comparableCategoryWithAllCount.setOrderCategoryCount(orderCount); comparableCategoryWithAllCount.setDefrayCategoryCount(defrayCount); return new Tuple2<>(comparableCategoryWithAllCount,allCount); } } ); // 2.sortByKey 排序 comparableCategory2AllCountRDD.sortByKey(); // 3.获取前五 List<Tuple2<ComparableCategoryKey,String>> top10Categorys = comparableCategory2AllCountRDD.take(5);