Distinct

1.topology

package com.suning.yystorm.business.adunion.uv;

import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import com.suning.yystorm.business.adunion.uv.bolt.UVAppFilterBolt;
import com.suning.yystorm.business.adunion.uv.bolt.UVAppStockFilterBolt;
import com.suning.yystorm.business.adunion.uv.bolt.UVCacheBolt;
import com.suning.yystorm.business.adunion.uv.bolt.UVWapFilterBolt;
import com.suning.yystorm.business.adunion.uv.bolt.UVWapStockFilterBolt;

import com.suning.yystorm.utils.PropertyUtils;

import backtype.storm.Config;
import backtype.storm.StormSubmitter;
import backtype.storm.spout.SchemeAsMultiScheme;
import backtype.storm.topology.TopologyBuilder;
import backtype.storm.tuple.Fields;
import storm.kafka.BrokerHosts;
import storm.kafka.KafkaSpout;
import storm.kafka.SpoutConfig;
import storm.kafka.StringScheme;
import storm.kafka.ZkHosts;

/**
 * Entry point that assembles and submits the "TweeterUVTopology_v10" Storm topology.
 *
 * <p>Four Kafka spouts (wap, app, stockWap, stockApp) each feed their own filter bolt; all four
 * filter bolts are then fields-grouped on {@code "dimension"} into a single {@link UVCacheBolt}
 * component, so tuples with the same dimension value always reach the same cache task.
 */
public class UVTopology {
    private static final Logger LOGGER = LoggerFactory.getLogger(UVTopology.class);

    /**
     * Builds the topology configuration and wiring, then submits it to the cluster.
     *
     * <p>Any exception raised while building is logged and swallowed, so the process exits
     * normally even when submission did not happen.
     *
     * @param args command-line arguments; not used
     */
    public static void main(String[] args) {
        try {
            Config config = new Config();
            config.setNumWorkers(8);
            config.setDebug(false);
            config.setMaxSpoutPending(3000);
            config.setMessageTimeoutSecs(180);

            String kafkaZKNode = PropertyUtils.getInstance().getValueByKey("kafka.zookeeper.nodes");
            String kafkaZKRoot = "/xx";
            String kafkaConsumerId = "tweeter_uv_v10";

            BrokerHosts brokerHosts = new ZkHosts(kafkaZKNode);

            // Kafka input: one spout configuration per topic, each with its own consumer id suffix.
            SpoutConfig wapSpoutConfig =
                    newSpoutConfig(brokerHosts, "a", kafkaZKRoot, kafkaConsumerId + "_wap");
            SpoutConfig appSpoutConfig =
                    newSpoutConfig(brokerHosts, "b", kafkaZKRoot, kafkaConsumerId + "_app");
            SpoutConfig stockWapSpoutConfig =
                    newSpoutConfig(brokerHosts, "c", kafkaZKRoot, kafkaConsumerId + "_stockWap");
            SpoutConfig stockAppSpoutConfig =
                    newSpoutConfig(brokerHosts, "d", kafkaZKRoot, kafkaConsumerId + "_stockApp");

            // Processing: spout -> filter bolt per source, then all filters -> UV cache bolt.
            TopologyBuilder builder = new TopologyBuilder();

            builder.setSpout("wapSpout", new KafkaSpout(wapSpoutConfig), 10);
            builder.setBolt("wapFilterBolt", new UVWapFilterBolt(), 40).shuffleGrouping("wapSpout");
            builder.setSpout("appSpout", new KafkaSpout(appSpoutConfig), 36);
            builder.setBolt("appFilterBolt", new UVAppFilterBolt(), 80).shuffleGrouping("appSpout");

            builder.setSpout("stockWapSpout", new KafkaSpout(stockWapSpoutConfig), 10);
            builder.setBolt("stockWapFilterBolt", new UVWapStockFilterBolt(), 20).shuffleGrouping("stockWapSpout");
            builder.setSpout("stockAppSpout", new KafkaSpout(stockAppSpoutConfig), 10);
            builder.setBolt("stockAppFilterBolt", new UVAppStockFilterBolt(), 40).shuffleGrouping("stockAppSpout");

            // Grouping by "dimension" keeps every dimension's counter on exactly one cache task.
            // NOTE(review): this requires each filter bolt to declare a "dimension" field - confirm.
            Fields dimensionField = new Fields("dimension");
            builder.setBolt("uvCacheBolt", new UVCacheBolt(), 60)
                    .fieldsGrouping("appFilterBolt", dimensionField)
                    .fieldsGrouping("wapFilterBolt", dimensionField)
                    .fieldsGrouping("stockWapFilterBolt", dimensionField)
                    .fieldsGrouping("stockAppFilterBolt", dimensionField);

            runCluster(config, builder);
        } catch (Exception e) {
            // Logged at ERROR (was INFO) so a topology that failed to start is not overlooked.
            LOGGER.error("error,", e);
        }
    }

    /**
     * Creates a Kafka spout configuration that decodes messages as strings and uses the
     * settings shared by every spout in this topology (no forced start offset, 3000 ms socket
     * timeout).
     *
     * @param brokerHosts broker lookup used by the spout
     * @param topic Kafka topic to consume
     * @param zkRoot ZooKeeper root path passed to the spout configuration
     * @param consumerId consumer id passed to the spout configuration
     * @return the populated spout configuration
     */
    private static SpoutConfig newSpoutConfig(BrokerHosts brokerHosts, String topic, String zkRoot,
            String consumerId) {
        SpoutConfig spoutConfig = new SpoutConfig(brokerHosts, topic, zkRoot, consumerId);
        spoutConfig.scheme = new SchemeAsMultiScheme(new StringScheme());
        spoutConfig.forceFromStart = false;
        spoutConfig.socketTimeoutMs = 3000;
        return spoutConfig;
    }

    /**
     * Submits the built topology under the name "TweeterUVTopology_v10".
     *
     * <p>Submission failures are logged and swallowed.
     *
     * @param config topology configuration
     * @param builder builder holding the spout/bolt wiring
     */
    private static void runCluster(Config config, TopologyBuilder builder) {
        try {
            StormSubmitter.submitTopology("TweeterUVTopology_v10", config, builder.createTopology());
        } catch (Exception e) {
            // Logged at ERROR (was INFO): a failed submission means the topology is not running.
            LOGGER.error("TweeterUVTopology_v10提交拓扑异常", e);
        }
    }
}

2.缓存,10秒记录一次

package com.suning.yystorm.business.adunion.uv.bolt;

import java.text.DateFormat;
import java.text.SimpleDateFormat;
import java.util.Date;
import java.util.HashMap;
import java.util.Map;

import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import com.suning.yystorm.business.adunion.AdUnionConsts;
import com.suning.yystorm.comm.hbase.HbaseClient;
import com.suning.yystorm.comm.hbase.HbaseClientImpl;
import com.suning.yystorm.utils.Distinct;
import com.suning.yystorm.utils.FixedMap;
import com.suning.yystorm.utils.TupleHelpers;

import backtype.storm.Config;
import backtype.storm.task.OutputCollector;
import backtype.storm.task.TopologyContext;
import backtype.storm.topology.OutputFieldsDeclarer;
import backtype.storm.topology.base.BaseRichBolt;
import backtype.storm.tuple.Tuple;

/**
 * Terminal bolt that keeps an approximate unique-visitor (UV) count per dimension and
 * periodically persists the counts that changed.
 *
 * <p>For every regular tuple the bolt offers the tuple's {@code uvid} to the {@link Distinct}
 * estimator of its {@code dimension}; when the estimated cardinality changes, the new value is
 * buffered. On every tick tuple (requested every 10 seconds via
 * {@link #getComponentConfiguration()}) the buffer is handed to the {@link HbaseClient} and then
 * cleared. The estimator is created with the pattern {@code "yyyyMMdd"}, i.e. counts are kept
 * per day.
 *
 * <p>The bolt declares no output fields and emits nothing.
 *
 * @author 15060063
 */
@SuppressWarnings("rawtypes")
public class UVCacheBolt extends BaseRichBolt {

    private static final long serialVersionUID = 3384989044692629267L;

    private static final Logger LOGGER = LoggerFactory.getLogger(UVCacheBolt.class);

    private OutputCollector collector = null;
    /** Dimensions whose estimated UV changed since the last successful flush, with the latest value. */
    private FixedMap<String, Long> map = null;
    /** Per-dimension cardinality estimators for the current day. */
    private Distinct distinct = null;
    private HbaseClient client = null;
    /** Formats the flush time that is passed along with every batch write. */
    private DateFormat format;

    /**
     * Initialises the buffer, the per-day estimator, the storage client and the timestamp format.
     *
     * @param stormConf topology configuration; not used
     * @param context topology context; not used
     * @param collector collector used to ack processed tuples
     */
    @Override
    public void prepare(Map stormConf, TopologyContext context, OutputCollector collector) {
        this.collector = collector;
        this.map = new FixedMap<String, Long>(AdUnionConsts.MAP_COUNT_UV_MAX);
        this.distinct = new Distinct(AdUnionConsts.MAP_COUNT_UV_MAX, "yyyyMMdd");
        this.client = new HbaseClientImpl();
        this.format = new SimpleDateFormat("yyyyMMddHHmmss");
    }

    /**
     * Flushes the buffer on tick tuples; otherwise updates the estimator for the tuple's dimension
     * and acks the tuple.
     *
     * <p>A regular tuple is acked even when processing it failed, so it is never replayed.
     *
     * @param tuple tick tuple, or a tuple carrying the string fields {@code dimension} and
     *        {@code uvid}
     */
    @Override
    public void execute(Tuple tuple) {
        if (TupleHelpers.isTickTuple(tuple)) {
            flushBufferedCounts();
            return;
        }

        try {
            String dimension = tuple.getStringByField("dimension");
            String uvid = tuple.getStringByField("uvid");

            long before = distinct.cardinality(dimension);
            distinct.easyOffer(dimension, uvid);
            long after = distinct.cardinality(dimension);

            // Only buffer dimensions whose estimate actually moved, to keep the batch small.
            if (before != after) {
                map.put(dimension, after);
            }
        } catch (Exception e) {
            // Logged at ERROR (was INFO) to match the storage-failure path of this bolt.
            LOGGER.error("tweeter UVCache异常", e);
        }

        this.collector.ack(tuple);
    }

    /**
     * Writes the buffered counts in one batch and clears the buffer.
     *
     * <p>If the write throws, the buffer is left untouched so the same values are written again
     * on the next tick.
     */
    private void flushBufferedCounts() {
        if (map.isEmpty()) {
            return;
        }
        try {
            // NOTE(review): the meaning of the "info" and false arguments is defined by
            // HbaseClient, which is not visible here - confirm before changing them.
            client.insertBatchDataByLongMd5(AdUnionConsts.TWEETER_UV, map, format.format(new Date()), "info", false);
            map.clear();
        } catch (Exception e) {
            LOGGER.error("推客UV数UVCacheBolt存储异常", e);
        }
    }

    /** This bolt emits nothing, so no output fields are declared. */
    @Override
    public void declareOutputFields(OutputFieldsDeclarer declarer) {

    }

    /**
     * Requests a tick tuple every 10 seconds, which drives the periodic flush.
     *
     * @return component configuration containing only the tick frequency
     */
    @Override
    public Map<String, Object> getComponentConfiguration() {
        Map<String, Object> conf = new HashMap<String, Object>();
        conf.put(Config.TOPOLOGY_TICK_TUPLE_FREQ_SECS, 10);
        return conf;
    }
}

2.1 按小时统计的缓存Bolt(HourUVCacheBolt,逻辑与上面的UVCacheBolt相同,只是按小时排重)

package com.suning.yystorm.business.adunion.houruv.bolt;

import java.text.DateFormat;
import java.text.SimpleDateFormat;
import java.util.Date;
import java.util.HashMap;
import java.util.Map;

import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import com.suning.yystorm.business.adunion.AdUnionConsts;
import com.suning.yystorm.comm.hbase.HbaseClient;
import com.suning.yystorm.comm.hbase.HbaseClientImpl;
import com.suning.yystorm.utils.Distinct;
import com.suning.yystorm.utils.FixedMap;
import com.suning.yystorm.utils.TupleHelpers;

import backtype.storm.Config;
import backtype.storm.task.OutputCollector;
import backtype.storm.task.TopologyContext;
import backtype.storm.topology.OutputFieldsDeclarer;
import backtype.storm.topology.base.BaseRichBolt;
import backtype.storm.tuple.Tuple;

/**
 * Terminal bolt that keeps an approximate unique-visitor (UV) count per dimension for the
 * current hour and periodically persists the counts that changed.
 *
 * <p>For every regular tuple the bolt offers the tuple's {@code uvid} to the {@link Distinct}
 * estimator of its {@code dimension}; when the estimated cardinality changes, the new value is
 * buffered. On every tick tuple (requested every 10 seconds via
 * {@link #getComponentConfiguration()}) the buffer is handed to the {@link HbaseClient} and then
 * cleared. The estimator is created with the pattern {@code "yyyyMMddHH"}, i.e. counts are kept
 * per hour.
 *
 * <p>The bolt declares no output fields and emits nothing.
 *
 * @author 15060063
 */
@SuppressWarnings("rawtypes")
public class HourUVCacheBolt extends BaseRichBolt {

    private static final long serialVersionUID = 3384989044692629267L;

    private static final Logger LOGGER = LoggerFactory.getLogger(HourUVCacheBolt.class);

    private OutputCollector collector = null;
    /** Dimensions whose estimated UV changed since the last successful flush, with the latest value. */
    private FixedMap<String, Long> map = null;
    /** Per-dimension cardinality estimators for the current hour. */
    private Distinct distinct = null;
    private HbaseClient client = null;
    /** Formats the flush time that is passed along with every batch write. */
    private DateFormat format;

    /**
     * Initialises the buffer, the per-hour estimator, the storage client and the timestamp format.
     *
     * @param stormConf topology configuration; not used
     * @param context topology context; not used
     * @param collector collector used to ack processed tuples
     */
    @Override
    public void prepare(Map stormConf, TopologyContext context, OutputCollector collector) {
        this.collector = collector;
        this.map = new FixedMap<String, Long>(AdUnionConsts.MAP_COUNT_UV_MAX);
        this.distinct = new Distinct(AdUnionConsts.MAP_COUNT_UV_MAX, "yyyyMMddHH");
        this.client = new HbaseClientImpl();
        this.format = new SimpleDateFormat("yyyyMMddHHmmss");
    }

    /**
     * Flushes the buffer on tick tuples; otherwise updates the estimator for the tuple's dimension
     * and acks the tuple.
     *
     * <p>A regular tuple is acked even when processing it failed, so it is never replayed.
     *
     * @param tuple tick tuple, or a tuple carrying the string fields {@code dimension} and
     *        {@code uvid}
     */
    @Override
    public void execute(Tuple tuple) {
        if (TupleHelpers.isTickTuple(tuple)) {
            flushBufferedCounts();
            return;
        }

        try {
            String dimension = tuple.getStringByField("dimension");
            String uvid = tuple.getStringByField("uvid");

            long before = distinct.cardinality(dimension);
            distinct.easyOffer(dimension, uvid);
            long after = distinct.cardinality(dimension);

            // Only buffer dimensions whose estimate actually moved, to keep the batch small.
            if (before != after) {
                map.put(dimension, after);
            }
        } catch (Exception e) {
            // Logged at ERROR (was INFO) to match the storage-failure path of this bolt.
            LOGGER.error("tweeter hour UVCache异常", e);
        }

        this.collector.ack(tuple);
    }

    /**
     * Writes the buffered counts in one batch and clears the buffer.
     *
     * <p>If the write throws, the buffer is left untouched so the same values are written again
     * on the next tick.
     */
    private void flushBufferedCounts() {
        if (map.isEmpty()) {
            return;
        }
        try {
            // NOTE(review): the meaning of the "info" and false arguments is defined by
            // HbaseClient, which is not visible here - confirm before changing them.
            client.insertBatchDataByLongMd5(AdUnionConsts.TWEETER_HOUR_UV, map, format.format(new Date()), "info", false);
            map.clear();
        } catch (Exception e) {
            LOGGER.error("推客hour UV数HourUVCacheBolt存储异常", e);
        }
    }

    /** This bolt emits nothing, so no output fields are declared. */
    @Override
    public void declareOutputFields(OutputFieldsDeclarer declarer) {

    }

    /**
     * Requests a tick tuple every 10 seconds, which drives the periodic flush.
     *
     * @return component configuration containing only the tick frequency
     */
    @Override
    public Map<String, Object> getComponentConfiguration() {
        Map<String, Object> conf = new HashMap<String, Object>();
        conf.put(Config.TOPOLOGY_TICK_TUPLE_FREQ_SECS, 10);
        return conf;
    }
}

 

3.去重

package com.suning.yystorm.utils;

import java.text.DateFormat;
import java.text.SimpleDateFormat;
import java.util.Date;

import com.clearspring.analytics.stream.cardinality.AdaptiveCounting;
import com.clearspring.analytics.stream.cardinality.ICardinality;

/**
 * Per-dimension distinct counting within a time period.
 *
 * <p>Each dimension gets its own {@link ICardinality} estimator, stored in a {@link FixedMap}.
 * The current period is identified by formatting "now" with the date pattern given to the
 * constructor (for example {@code yyyyMMdd} for daily or {@code yyyyMMddHH} for hourly
 * de-duplication). Whenever {@link #offer}, {@link #easyOffer} or {@link #getCard} notices that
 * the formatted period has changed, all estimators are discarded.
 *
 * <p>Concurrency: {@link #offer} and {@link #getCard} create a missing estimator while holding
 * a lock on the map; {@link #easyOffer} takes no lock. The period rollover itself is not
 * locked in any of them.
 */
public class Distinct {

    private final FixedMap<String, ICardinality> cardinalityMap;

    private final DateFormat format;
    /** Formatted period the current estimators belong to. */
    private String time = null;

    /**
     * Creates a counter that de-duplicates within the period described by a date pattern.
     *
     * @param size size passed to the underlying {@link FixedMap}
     * @param _format date pattern identifying a period, e.g. {@code yyyyMMdd} or
     *        {@code yyyyMMddHH}
     */
    public Distinct(int size, String _format) {
        cardinalityMap = new FixedMap<String, ICardinality>(size);

        format = new SimpleDateFormat(_format);
        time = format.format(new Date());
    }

    /**
     * Offers a keyword to the estimator of a dimension, creating the estimator under a lock when
     * it does not exist yet.
     *
     * @param dimension dimension whose estimator receives the keyword
     * @param keyword value to count
     * @return the result of {@link ICardinality#offer(Object)}; the original documentation
     *         describes it as {@code true} for a new keyword and {@code false} for a duplicate
     */
    public boolean offer(String dimension, String keyword) {
        resetIfPeriodChanged();
        return lockedGetOrCreate(dimension).offer(keyword);
    }

    /**
     * Same as {@link #offer(String, String)} but without any locking when the estimator has to
     * be created.
     *
     * @param dimension dimension whose estimator receives the keyword
     * @param keyword value to count
     * @return the result of {@link ICardinality#offer(Object)}
     */
    public boolean easyOffer(String dimension, String keyword) {
        resetIfPeriodChanged();

        if (cardinalityMap.get(dimension) == null) {
            cardinalityMap.put(dimension, newEstimator());
        }

        return cardinalityMap.get(dimension).offer(keyword);
    }

    /**
     * Returns the current estimate for a dimension without creating an estimator and without
     * checking for a period change.
     *
     * @param dimension dimension to look up
     * @return estimated number of distinct keywords, or 0 when the dimension has no estimator
     */
    public long cardinality(String dimension) {
        ICardinality card = cardinalityMap.get(dimension);
        return card == null ? 0 : card.cardinality();
    }

    /**
     * Returns the estimator of a dimension for the current period, creating it under a lock when
     * it does not exist yet.
     *
     * @param dimension dimension to look up
     * @return the estimator stored for the dimension
     */
    public ICardinality getCard(String dimension) {
        resetIfPeriodChanged();
        return lockedGetOrCreate(dimension);
    }

    /**
     * Stores (or replaces) the estimator of a dimension.
     *
     * @param dimension dimension to store the estimator under
     * @param cardinality estimator to store
     * @return always {@code true}
     */
    public boolean setCard(String dimension, ICardinality cardinality) {
        cardinalityMap.put(dimension, cardinality);
        return true;
    }

    /**
     * Exposes the live map of estimators (not a copy).
     *
     * @return the internal dimension-to-estimator map
     */
    public FixedMap<String, ICardinality> getCardinalityMap() {
        return cardinalityMap;
    }

    /** Discards all estimators when the formatted current time no longer matches the stored period. */
    private void resetIfPeriodChanged() {
        String currentPeriod = format.format(new Date());
        if (!time.equals(currentPeriod)) {
            cardinalityMap.clear();
            time = currentPeriod;
        }
    }

    /**
     * Looks up the estimator of a dimension, creating it while holding the map lock if missing.
     *
     * <p>The map is read again inside the lock. The previous {@code offer} re-tested its stale
     * local variable instead, so a thread that lost the race replaced the estimator the winner
     * had just installed and the winner's counts were lost.
     */
    private ICardinality lockedGetOrCreate(String dimension) {
        if (cardinalityMap.get(dimension) == null) {
            synchronized (cardinalityMap) {
                if (cardinalityMap.get(dimension) == null) {
                    cardinalityMap.put(dimension, newEstimator());
                }
            }
        }
        return cardinalityMap.get(dimension);
    }

    /** Builds a fresh estimator with the settings used for every dimension. */
    private static ICardinality newEstimator() {
        return AdaptiveCounting.Builder.obyCount(Integer.MAX_VALUE).build();
    }

}

 

posted on 2019-05-17 19:36  cxhfuujust  阅读(391)  评论(0编辑  收藏  举报

导航