Spark 通过 pipeline 方式批量写入 Redis 集群
网上关于 Spark 通过 pipeline 方式批量写入 Redis 集群的资料比较少,大多数都是针对单机 Redis 的写法;Spring 倒是有写入 Redis 集群的实现代码。下面整理了 Spark 通过 pipeline 批量写入 Redis 集群的方式,速度确实快,不然一条条 set 进去,真的是天都要黑了。
需要用到的 Maven 依赖如下(Spark 相关依赖略):
<dependency>
    <groupId>org.mybatis</groupId>
    <artifactId>mybatis</artifactId>
    <version>3.5.2</version>
</dependency>
<dependency>
    <groupId>redis.clients</groupId>
    <artifactId>jedis</artifactId>
    <version>3.3.0</version>
</dependency>
以下是 Spark 集成 Redis Cluster 的示例代码:
import org.apache.spark.SparkConf; import org.apache.spark.sql.Dataset; import org.apache.spark.sql.Row; import org.apache.spark.sql.SparkSession; import redis.clients.jedis.HostAndPort; import redis.clients.jedis.JedisCluster; import redis.clients.jedis.JedisPoolConfig; import java.util.HashSet; import java.util.Set; public class SparkPiplineRedis { public static void main(String[] args) { SparkConf conf = new SparkConf(); conf.setAppName("test").setMaster("local"); SparkSession session = SparkSession.builder().config(conf).getOrCreate(); Dataset<Row> dataset = session.sql("" + "select '1001' id,'jeff' name,1 age " + "union all " + "select '1002' id,'kitty' name,2 age "); String hosts = ""; String ports = ""; dataset.foreachPartition(iter->{ JedisPoolConfig jedisPoolConfig = new JedisPoolConfig(); jedisPoolConfig.setMaxTotal(10);// 最大连接数, 默认8个 jedisPoolConfig.setMaxIdle(10);// redis.maxIdle jedisPoolConfig.setMaxWaitMillis(2000);// 2s jedisPoolConfig.setTestOnBorrow(true); Set<HostAndPort> hostAndPortsSet = new HashSet<HostAndPort>(); for (String ip : hosts.split(",")) { for (String port : ports.split(",")) { hostAndPortsSet.add(new HostAndPort(ip, Integer.parseInt(port))); } } JedisCluster jedisCluster = new JedisCluster(hostAndPortsSet, jedisPoolConfig); JedisClusterPipeline jedisClusterPipeline = new JedisClusterPipeline(jedisCluster); while (iter.hasNext()){ Row row = iter.next(); String id = row.getAs("id").toString(); String name = row.getAs("name").toString(); jedisClusterPipeline.hsetByPipeline("TEST:PERSON",id,name); } jedisClusterPipeline.releaseConnection(); }); session.stop(); } }
jedisCluster管道方式实现代码如下(转自哪里忘了):
import org.apache.ibatis.reflection.MetaObject; import org.apache.ibatis.reflection.SystemMetaObject; import redis.clients.jedis.*; import redis.clients.jedis.exceptions.JedisNoReachableClusterNodeException; import redis.clients.jedis.util.JedisClusterCRC16; import java.util.concurrent.ConcurrentHashMap; import java.util.concurrent.CopyOnWriteArrayList; /** * 基于JedisCluster实现管道的使用 * 核心对象:JedisClusterInfoCache和JedisSlotBasedConnectionHandler * 使用构造方法将JedisCluster对象传递进来 */ public class JedisClusterPipeline { /** * 构造方法 * 通过JedisCluster获取JedisClusterInfoCache和JedisSlotBasedConnectionHandler * @param jedisCluster */ public JedisClusterPipeline(JedisCluster jedisCluster){ this.jedisCluster = jedisCluster; MetaObject metaObject = SystemMetaObject.forObject(jedisCluster); clusterInfoCache = (JedisClusterInfoCache)metaObject.getValue("connectionHandler.cache"); connectionHandler = (JedisSlotBasedConnectionHandler)metaObject.getValue("connectionHandler"); } /** 管道命令提交阈值 */ private final int MAX_COUNT = 10000; /** Redis集群缓存信息对象 Jedis提供*/ private JedisClusterInfoCache clusterInfoCache; /** Redis链接处理对象 继承于JedisClusterConnectionHandler,对其提供友好的调用方法 Jedis提供 */ private JedisSlotBasedConnectionHandler connectionHandler; /** Redis集群操作对象 Jedis提供 */ private JedisCluster jedisCluster; /** 存储获取的Jedis对象,用于统一释放对象 */ private CopyOnWriteArrayList<Jedis> jedisList = new CopyOnWriteArrayList(); /** 存储获取的Jedis连接池对象与其对应开启的管道,用于保证slot(哈希槽)对应的节点链接的管道只被开启一次 */ private ConcurrentHashMap<JedisPool, Pipeline> pipelines = new ConcurrentHashMap<>(); /** 存储每个开启的管道需要处理的命令(数据)数,当计数达到提交阈值时进行提交 */ private ConcurrentHashMap<Pipeline, Integer> nums = new ConcurrentHashMap<>(); public void hsetByPipeline(String key, String field, String value){ Pipeline pipeline = getPipeline(key); pipeline.hset(key, field, value); nums.put(pipeline, nums.get(pipeline) + 1); this.maxSync(pipeline); } /** * 释放获取的Jedis链接 * 释放的过程中会强制执行PipeLine sync */ public void releaseConnection() { jedisList.forEach(jedis -> jedis.close()); } 
/** * 获取JedisPool * 第一次获取不到尝试刷新缓存的SlotPool再获取一次 * @param key * @return */ private JedisPool getJedisPool(String key){ /** 通过key计算出slot */ int slot = JedisClusterCRC16.getSlot(key); /** 通过slot获取到对应的Jedis连接池 */ JedisPool jedisPool = clusterInfoCache.getSlotPool(slot); if(null != jedisPool){ return jedisPool; }else{ /** 刷新缓存的SlotPool */ connectionHandler.renewSlotCache(); jedisPool = clusterInfoCache.getSlotPool(slot); if (jedisPool != null) { return jedisPool; } else { throw new JedisNoReachableClusterNodeException("No reachable node in cluster for slot " + slot); } } } /** * 获取Pipeline对象 * 缓存在pipelines中,保证集群中同一节点的Pipeline只被开启一次 * 管道第一次开启,jedisList,pipelines,nums存入与该管道相关信息 * @param key * @return */ private Pipeline getPipeline(String key){ JedisPool jedisPool = getJedisPool(key); /** 检查管道是否已经开启 */ Pipeline pipeline = pipelines.get(jedisPool); if(null == pipeline){ Jedis jedis = jedisPool.getResource(); pipeline = jedis.pipelined(); jedisList.add(jedis); pipelines.put(jedisPool, pipeline); nums.put(pipeline, 0); } return pipeline; } /** * 管道对应的命令计数,并在达到阈值时触发提交 * 提交后计数归零 * @param pipeline * @return */ private void maxSync(Pipeline pipeline){ Integer num = nums.get(pipeline); if(null != num){ if(num % MAX_COUNT == 0){ pipeline.sync(); nums.put(pipeline, 0); } } } }