spark更新插入(upsert)到mysql方式

 

spark数据有时候需要插入mysql中的数据,有时候存在的更新已经存在的数据,也就是mpp中的upsert操作,但是spark暂时给的api只有overwrite, append等,无法满足插入更新业务逻辑;

因此需要自定义实现,可以自行批量方式处理,例如:

insert into personinfo(id,name,age) values('1','2',8) on duplicate key update id = values(id),name = values(name),age = values(age);

因此,我们可以通过spark的方式组装以上代码就可以实现upsert功能(网上很多都是用scala实现的,这里稍微改成了java):

public class Test01 extends BasicsFunction {
    private static final Log logger = LogFactory.getLog(Test01.class);

    public static void main(String[] args) {
        SparkConf sparkConf = new SparkConf();//配置环境
        sparkConf.setMaster("local[*]").setAppName("test");
        SparkSession session = SparkSession.builder().config(conf).getOrCreate();
        
        Dataset<Row> dataset = session.sql("" +
                        "select '1001' id,'jeff' name,2 age " +
                        "union all " +
                        "select '1001' id,'tony' name,3 age " +
                        "union all " +
                        "select '1002' id,'kitty' name,2 age ");
        dataset.show(3);
        
        String table = "personinfo";
        upsertMysql(dataset, table);
    }
    
    public void upsertMysql(Dataset<Row> dataset, String table) {
        String[] cols = dataset.columns();
        StringBuffer sqlStr = new StringBuffer();
        // 构建 values(?,?)
        StringBuffer sqlValue = new StringBuffer();

        sqlStr.append("insert into "+ table +"(");

        for (int i = 0; i < cols.length; i++) {
            String field = cols[i];
            sqlStr.append(field + ",");
            sqlValue.append("?,");
        }
        sqlStr.replace(sqlStr.length() - 1, sqlStr.length(), ") values(").append(sqlValue.replace(sqlValue.length() - 1, sqlValue.length(), ") on duplicate key update "));

//        String[] updateCols = {"name", "age"};
        // 拼接更新字段
        for (String field : cols) {
            if (field.contains("remain_status"))
                sqlStr.append(""+field+" = concat("+field+" , ',',values("+field+")),");
            else
                sqlStr.append(""+field+" = values("+field+"),");
        }
        String result_sql = sqlStr.substring(0, sqlStr.length() - 1);
        logger.warn("【拼接sql】" + result_sql);

        dataset.repartition(1).foreachPartition(iter -> {
            Connection conn = CustomMysqlUtil.getInstance().getConn();
            PreparedStatement ps = conn.prepareStatement(result_sql.toString());
            int count = 0;
            int[] batch = {};
            try {
                conn.setAutoCommit(false);
                while (iter.hasNext()) {
                    Row row = iter.next();
                    StructField[] fields = row.schema().fields();
                    for (int i = 0; i < fields.length; i++) {
                        ps.setObject(i + 1, row.getAs(i));
                    }
                    ps.addBatch();
                    count += 1;
                    if (count % 10000 == 0) {
                        batch = ps.executeBatch();
                    }
                }
                conn.commit();
            } catch (Exception e) {
                e.printStackTrace();
            } finally {
                batch = ps.executeBatch();
                logger.warn("【update info】"+batch);
                conn.commit();
                close(conn, ps);
            }
        });
    }

    private void close(Connection conn, PreparedStatement ps) throws Exception {
        if (conn != null) {
            conn.close();
        }

        if (ps != null) {
            ps.close();
        }
    }
}

以上便以简单的方式实现了更新mysql数据的功能,坑爹的是好像效率并不是特别好,尴尬.....

 

posted @ 2022-10-16 20:31  消失的白桦林  阅读(1231)  评论(0编辑  收藏  举报