spark更新插入(upsert)到mysql方式
spark数据有时候需要插入mysql中的数据,有时候存在的更新已经存在的数据,也就是mpp中的upsert操作,但是spark暂时给的api只有overwrite, append等,无法满足插入更新业务逻辑;
因此需要自定义实现,可以自行批量方式处理,例如:
insert into personinfo(id,name,age) values('1','2',8) on duplicate key update id = values(id),name = values(name),age = values(age);
因此,我们可以通过spark的方式组装以上代码就可以实现upsert功能(网上很多都是用scala实现的,这里稍微改成了java):
public class Test01 extends BasicsFunction { private static final Log logger = LogFactory.getLog(Test01.class); public static void main(String[] args) { SparkConf sparkConf = new SparkConf();//配置环境 sparkConf.setMaster("local[*]").setAppName("test"); SparkSession session = SparkSession.builder().config(conf).getOrCreate(); Dataset<Row> dataset = session.sql("" + "select '1001' id,'jeff' name,2 age " + "union all " + "select '1001' id,'tony' name,3 age " + "union all " + "select '1002' id,'kitty' name,2 age "); dataset.show(3); String table = "personinfo"; upsertMysql(dataset, table); } public void upsertMysql(Dataset<Row> dataset, String table) { String[] cols = dataset.columns(); StringBuffer sqlStr = new StringBuffer(); // 构建 values(?,?) StringBuffer sqlValue = new StringBuffer(); sqlStr.append("insert into "+ table +"("); for (int i = 0; i < cols.length; i++) { String field = cols[i]; sqlStr.append(field + ","); sqlValue.append("?,"); } sqlStr.replace(sqlStr.length() - 1, sqlStr.length(), ") values(").append(sqlValue.replace(sqlValue.length() - 1, sqlValue.length(), ") on duplicate key update ")); // String[] updateCols = {"name", "age"}; // 拼接更新字段 for (String field : cols) { if (field.contains("remain_status")) sqlStr.append(""+field+" = concat("+field+" , ',',values("+field+")),"); else sqlStr.append(""+field+" = values("+field+"),"); } String result_sql = sqlStr.substring(0, sqlStr.length() - 1); logger.warn("【拼接sql】" + result_sql); dataset.repartition(1).foreachPartition(iter -> { Connection conn = CustomMysqlUtil.getInstance().getConn(); PreparedStatement ps = conn.prepareStatement(result_sql.toString()); int count = 0; int[] batch = {}; try { conn.setAutoCommit(false); while (iter.hasNext()) { Row row = iter.next(); StructField[] fields = row.schema().fields(); for (int i = 0; i < fields.length; i++) { ps.setObject(i + 1, row.getAs(i)); } ps.addBatch(); count += 1; if (count % 10000 == 0) { batch = ps.executeBatch(); } } conn.commit(); } catch (Exception e) { e.printStackTrace(); } finally { batch = ps.executeBatch(); logger.warn("【update info】"+batch); conn.commit(); close(conn, ps); } }); } private void close(Connection conn, PreparedStatement ps) throws Exception { if (conn != null) { conn.close(); } if (ps != null) { ps.close(); } } }
以上便以简单的方式实现了更新mysql数据的功能,坑爹的是好像效率并不是特别好,尴尬.....