impala通过jdbc写入效率低下优化
背景:
Kudu+Impala很适合数据分析, 但直接使用Insert values语句往Kudu表插入数据, 效率实在不好, 测试下来insert的速度仅为80笔/秒. 原因也是显然的, Kudu本身写入效率很高, 但是Impala并没有做这方面优化, 观察下来每次Impala语句执行的overhead都太大了, 导致频繁小批次写入效率非常差, Kudu官方推荐使用Java API或Python API完成数据写入工作. 下面是使用Java API的测试用例, 也可以看出Kudu API的大致用法.
解决方法:
1、采用外部表的方式,对于自动化流程比较麻烦
2、采用文件导入,对于自动化流程比较麻烦
3、使用kudu对应原生的java api,用于提升写入、更新和删除操作,本文主要是采用java api的方式
maven依赖:
<dependency> <groupId>org.apache.kudu</groupId> <artifactId>kudu-client</artifactId> <!-- <version>1.4.0</version>--> <version>1.11.1</version> </dependency>
写入程序:
package org.kudu; import org.apache.kudu.client.*; import java.sql.*; import java.util.Date; public class InsertRow { public static void main(String[] args) throws Exception { // impala链接地址:用于模拟数据 Connection con = null; ResultSet rs = null; PreparedStatement ps = null; String JDBC_DRIVER = "com.cloudera.impala.jdbc41.Driver"; String CONNECTION_URL = "jdbc:impala://ecs-kudu-0004:21051/pro_dfcv_tsp"; Class.forName(JDBC_DRIVER); con = (Connection) DriverManager.getConnection(CONNECTION_URL); // kudu 链接地址 String masterAddr = "cdh02.dongfeng.com:7051"; KuduClient client = new KuduClient.KuduClientBuilder(masterAddr) .defaultSocketReadTimeoutMs(6000).build(); // 对应的表名称:这个表名称特殊,补全前缀 KuduTable table = client.openTable("impala::pro_dfcv_tsp.hy_car2"); KuduSession kuduSession = client.newSession(); // 采用手动刷新 kuduSession.setFlushMode(SessionConfiguration.FlushMode.MANUAL_FLUSH); // 用于批量写入的临时缓冲区要足够大 kuduSession.setMutationBufferSpace(300000000); // 一共73个字段,取出15000条 String sql = "select * from hy_car limit 15000"; ps = con.prepareStatement(sql); rs = ps.executeQuery(); final ResultSetMetaData metaData = rs.getMetaData(); final int columnCount = metaData.getColumnCount(); int idx = 0; while (rs.next()) { idx++; // 创建每条的Insert Insert insert = table.newInsert(); for(int i=1;i<=columnCount;i++){ final String columnTypeName = metaData.getColumnTypeName(i); if ("BIGINT".equals(columnTypeName)){ insert.getRow().addLong(metaData.getColumnName(i), rs.getLong(i)); }else if ("INT".equals(columnTypeName)){ insert.getRow().addInt(metaData.getColumnName(i), rs.getInt(i)); }else{ if (rs.getString(i)!= null && rs.getString(i).equals("")){ insert.getRow().addString(metaData.getColumnName(i), ""); } } } // 先不提交kudu kuduSession.apply(insert); if (idx % 2000 == 0){ System.out.println(new Date()); // 批量写入kudu kuduSession.flush(); } } // 最后,补加一条批量写入 kuduSession.flush(); // 关闭和kudu的会话 kuduSession.close(); // 关闭impala的链接 con.close(); } }
更改表数据:
package cn.wcb.kudu; import org.apache.kudu.client.*; public class UpdateRow { public static void main(String[] args) { //master地址 String masterAddr = "192.168.56.56"; KuduClient client = new KuduClient.KuduClientBuilder(masterAddr) .defaultSocketReadTimeoutMs(6000).build(); try { KuduTable table = client.openTable("student"); KuduSession session = client.newSession(); session.setFlushMode(SessionConfiguration.FlushMode.AUTO_FLUSH_SYNC); //更新数据 Update update = table.newUpdate(); PartialRow row = update.getRow(); row.addInt("id",1); row.addString("name","didi"); session.apply(update); session.close(); } catch (KuduException e) { e.printStackTrace(); } finally { try { client.close(); } catch (KuduException e) { e.printStackTrace(); } } } }
删除数据:
package cn.wcb.kudu; import org.apache.kudu.client.*; public class DeleteRow { public static void main(String[] args) { //master地址 String masterAddr = "192.168.56.56"; KuduClient client = new KuduClient.KuduClientBuilder(masterAddr) .defaultSocketReadTimeoutMs(6000).build(); try { KuduTable table = client.openTable("student"); KuduSession session = client.newSession(); //删除数据 Delete delete = table.newDelete(); //注意:行删除和更新操作必须指定要更改行的行的完整主键 delete.getRow().addInt("id",1); session.flush(); session.apply(delete); session.close(); } catch (KuduException e) { e.printStackTrace(); } finally { try { client.close(); } catch (KuduException e) { e.printStackTrace(); } } } }
效果: