欢迎这位怪蜀黍来到《项目实战从0到1之Spark(10)Spark读取HDFS写入Hive - 大码王 - 博客园》

关闭页面特效
复制代码
package com.xxxx.report.service;

import com.google.common.collect.Lists;
import com.xx.report.config.Constants;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.function.Function;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.SparkSession;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.io.Serializable;
import java.text.SimpleDateFormat;
import java.time.LocalDate;
import java.time.format.DateTimeFormatter;
import java.util.Calendar;
import java.util.List;

/**
 * Spark job that reads text log files located under
 * {@code Constants.PREFIX_BICYCLE_LOG_PATH_YangXin}, one day directory at a time, parses each
 * line into an {@link EngineLockLog} and appends the parsed rows to the matching {@code dt}
 * partition of the Hive table {@code tableName1}.
 *
 * <p>The class is {@link Serializable} because the mapping lambda handed to Spark calls the
 * instance method {@code handleLine} and therefore captures {@code this}.
 *
 * @author huanghanyu
 */
public class BicycleLog2hive implements Serializable{
    // Logger shared by the driver-side loop and the per-line parser.
    private static final Logger LOG = LoggerFactory.getLogger(BicycleLog2hive.class);
    // Compact yyyyMMdd day format used in the input directory names. DateTimeFormatter is
    // immutable and thread-safe, unlike the shared SimpleDateFormat it replaces.
    private static final DateTimeFormatter DAY_FORMAT = DateTimeFormatter.BASIC_ISO_DATE;
    private static final String TMP_TABLE_NAME = "tableNameTemp";
    private static final String TABLE_NAME = "tableName1";
    private static final String APP_NAME = "xxxxx@yangxin";

    /**
     * Parses one raw log line into an {@link EngineLockLog}.
     *
     * <p>Any exception raised while parsing is logged together with the offending line and is
     * not propagated; the object built so far is returned. {@link #run} later drops every
     * result whose user id is still {@code null}.
     *
     * @param line raw text line read from the input files
     * @return the (possibly only partially populated) parsed record, never {@code null}
     */
    private EngineLockLog handleLine(String line) {
        EngineLockLog engineLockLog = new EngineLockLog();
        try {
            // Per-line trace at debug level: this runs once for every input line.
            LOG.debug("handleLine Function -> : {}", line);
            // NOTE(review): the field-extraction statements below are elided placeholders in
            // this copy of the source and must be restored before the class can compile.
            xxxxxxxxxxxxxxxxx
            xxxxxxxxxxxxxxx
            xxxxxxxxxxxx
        } catch (Exception error) {
            // Deliberately best-effort: one malformed line must not fail the whole job.
            // Passing the exception as the last argument keeps the stack trace in the log.
            LOG.warn("{} | {}", error.getMessage(), line, error);
        }
        return engineLockLog;
    }

    /**
     * Strips the {@code -} and {@code _} separators from a day string and parses the remainder
     * strictly as {@code yyyyMMdd}.
     *
     * @param value day such as {@code 20190101}, {@code 2019-01-01} or {@code 2019_01_01}
     * @return the parsed calendar day
     * @throws java.time.format.DateTimeParseException if the cleaned value is not a valid
     *         {@code yyyyMMdd} date
     */
    private static LocalDate parseDay(String value) {
        String compact = value.replace("-", "").replace("_", "");
        return LocalDate.parse(compact, DAY_FORMAT);
    }

    /**
     * Loads every day from {@code startTime} to {@code endTime} (both inclusive) into Hive.
     *
     * <p>For each day the files matching {@code <prefix><yyyyMMdd>/*}{@code /*} are read,
     * parsed with {@code handleLine}, filtered to records that have a user id and, when at
     * least one record remains, inserted into partition {@code dt='yyyy-MM-dd'}.
     *
     * @param master    currently unused; the Spark master is not set by this method
     * @param startTime first day to load; {@code -} and {@code _} separators are ignored
     * @param endTime   last day to load; {@code -} and {@code _} separators are ignored
     * @throws IllegalArgumentException if {@code endTime} is before {@code startTime}
     * @throws java.time.format.DateTimeParseException if either bound is not a valid date
     */
    public void run(String master, String startTime, String endTime) {
        long startTimestamp = System.currentTimeMillis();

        // Validate both bounds up front. Comparing formatted strings in a loop never
        // terminates when the end precedes the start or is not exactly a yyyyMMdd value.
        LocalDate firstDay = parseDay(startTime);
        LocalDate lastDay = parseDay(endTime);
        if (lastDay.isBefore(firstDay)) {
            throw new IllegalArgumentException(
                    "endTime (" + endTime + ") must not be before startTime (" + startTime + ")");
        }

        SparkSession spark = SparkSession.builder().appName(APP_NAME).enableHiveSupport().getOrCreate();

        for (LocalDate current = firstDay; !current.isAfter(lastDay); current = current.plusDays(1)) {
            String day = current.format(DAY_FORMAT);
            LOG.info("日期:-> {}", day);
            String path = Constants.PREFIX_BICYCLE_LOG_PATH_YangXin + day + "/*/*";
            LOG.info("路径:-> {}", path);

            JavaRDD<EngineLockLog> mapRDD = spark.read().textFile(path)
                    .javaRDD()
                    .map(line -> handleLine(line))
                    // Lines that failed to yield a user id are discarded.
                    .filter(engineLockLog -> engineLockLog.getUser_id() != null);

            // Skip the insert entirely when the day produced no usable records.
            if (!mapRDD.isEmpty()) {
                Dataset<Row> mapDF = spark.createDataFrame(mapRDD, EngineLockLog.class);
                mapDF.createOrReplaceTempView(TMP_TABLE_NAME);
                // Partition value uses the dashed yyyy-MM-dd form of the same day.
                String dayTemp = current.format(DateTimeFormatter.ISO_LOCAL_DATE);
                String insertSQL = "insert into table " + TABLE_NAME + " partition(dt='" + dayTemp + "') " +
                        "select xxxx,xxxxx,xxxxx from " + TMP_TABLE_NAME;
                spark.sql(insertSQL);
            }
        }

        long endTimestamp = System.currentTimeMillis();
        System.out.println("总耗时: -> " + (endTimestamp - startTimestamp) + "ms");
    }

    /**
     * Command-line entry point.
     *
     * @param args {@code <master> <startTime> <endTime>}
     * @throws IllegalArgumentException if fewer than three arguments are supplied
     */
    public static void main(String[] args) {
        if (args == null || args.length < 3) {
            throw new IllegalArgumentException(
                    "Usage: BicycleLog2hive <master> <startTime> <endTime>");
        }
        BicycleLog2hive bicycleLog2hive = new BicycleLog2hive();
        bicycleLog2hive.run(args[0], args[1], args[2]);
    }
}
复制代码

 

 posted on   大码王  阅读(841)  评论(1)  编辑  收藏  举报
编辑推荐:
· 开发者必知的日志记录最佳实践
· SQL Server 2025 AI相关能力初探
· Linux系列:如何用 C#调用 C方法造成内存泄露
· AI与.NET技术实操系列(二):开始使用ML.NET
· 记一次.NET内存居高不下排查解决与启示
阅读排行:
· 阿里最新开源QwQ-32B,效果媲美deepseek-r1满血版,部署成本又又又降低了!
· 开源Multi-agent AI智能体框架aevatar.ai,欢迎大家贡献代码
· Manus重磅发布:全球首款通用AI代理技术深度解析与实战指南
· 被坑几百块钱后,我竟然真的恢复了删除的微信聊天记录!
· AI技术革命,工作效率10个最佳AI工具

成都

复制代码

喜欢请打赏

扫描二维码打赏

了解更多

点击右上角即可分享
微信分享提示