大数据开发
// Build (or reuse) the SparkSession with Hive support, Adaptive Query Execution,
// and Hive output-file merging enabled.
// NOTE(review): the timeout values carry no unit suffix. In Spark,
// spark.executor.heartbeatInterval defaults to "10s" and a bare number is parsed
// in milliseconds (18000 = 18s), while spark.network.timeout is parsed in seconds
// (36000 = 10h). Confirm these magnitudes are intended; heartbeatInterval must
// stay well below network.timeout.
SparkSession spark = SparkSession.builder()
.appName("spark-item")
.config("spark.sql.warehouse.dir", warehouse_location)
.enableHiveSupport()
.config("spark.sql.adaptive.enabled",true)
.config("spark.sql.hive.mergeFiles",true)
.config("spark.executor.heartbeatInterval", "18000")
.config("spark.network.timeout", "36000")
.config("spark.sql.broadcastTimeout", "1800")
.getOrCreate();
/**
 * ETL flow: query the focus-data/item result for the previous day, map it
 * partition-by-partition into the tmp result type, register it as a temp view,
 * then deduplicate and insert it into the target Hive table.
 */
// Fetch the SQL that selects yesterday's (possibly duplicated) result rows.
// NOTE(review): `yesterday` is defined outside this fragment — presumably a
// date-partition string; verify against the caller.
String reduplicatedListSql = GetExecuteSql.getFocusDataMapAddItemGdResultList(yesterday);
// Convert the untyped SQL result into a typed Dataset via a JavaBean Encoder.
Encoder<ItemFocusDataMapItemGd> itemFocusDataMapItemGdEncoder = Encoders.bean(ItemFocusDataMapItemGd.class);
Dataset<ItemFocusDataMapItemGd> itemMixedEtlResultDataset = spark.sql(reduplicatedListSql).as(itemFocusDataMapItemGdEncoder);
Encoder<ItemMixedEtlResultTmp> itemMixedEtlResultTmpEncoder = Encoders.bean(ItemMixedEtlResultTmp.class);
// Repartition (parallelism controlled by `repartitions`, defined outside this
// fragment), then transform each partition with the project-specific mapper.
Dataset<ItemMixedEtlResultTmp> itemMixedEtlDuplicatedResultDataset = itemMixedEtlResultDataset.repartition(repartitions).
mapPartitions(new MapPartitionFocusDataMapAddItem(),itemMixedEtlResultTmpEncoder);
// Expose the transformed rows to SQL under the name used by the insert statement.
itemMixedEtlDuplicatedResultDataset.createOrReplaceTempView("result_data");
// Merge small output files at map/reduce stage (size targets: 160 MiB per task,
// merge when average file size is below 120 MiB).
spark.sqlContext().setConf("hive.merge.mapfiles","true");
spark.sqlContext().setConf("hive.merge.mapredfiles","true");
spark.sqlContext().setConf("hive.merge.size.per.task","167772160");
spark.sqlContext().setConf("hive.merge.smallfiles.avgsize","125829120");
// Allow fully dynamic partition values in the INSERT below.
spark.sqlContext().setConf("hive.exec.dynamic.partition", "true");
spark.sqlContext().setConf("hive.exec.dynamic.partition.mode", "nonstrict");
// Deduplicate the generated temp view and insert it into the target table.
// NOTE(review): the actual dedup logic lives in the SQL returned by
// getInsertFocusDataMapAddItemGdSql — not visible here; confirm it reads
// from "result_data".
spark.sql(GetExecuteSql.getInsertFocusDataMapAddItemGdSql(source));
}
【推荐】国内首个AI IDE,深度理解中文开发场景,立即下载体验Trae
【推荐】编程新体验,更懂你的AI,立即体验豆包MarsCode编程助手
【推荐】抖音旗下AI助手豆包,你的智能百科全书,全免费不限次数
【推荐】轻量又高性能的 SSH 工具 IShell:AI 加持,快人一步
· 全程不用写代码,我用AI程序员写了一个飞机大战
· DeepSeek 开源周回顾「GitHub 热点速览」
· 记一次.NET内存居高不下排查解决与启示
· MongoDB 8.0这个新功能碉堡了,比商业数据库还牛
· .NET10 - 预览版1新功能体验(一)