Big Data Development

// Build a Hive-enabled SparkSession with adaptive execution and relaxed timeouts
SparkSession spark = SparkSession.builder()
        .appName("spark-item")
        .config("spark.sql.warehouse.dir", warehouse_location)
        .enableHiveSupport()
        // Adaptive Query Execution tunes shuffle partitions at runtime
        .config("spark.sql.adaptive.enabled", true)
        .config("spark.sql.hive.mergeFiles", true)
        // Generous heartbeat/network timeouts for long-running ETL stages
        .config("spark.executor.heartbeatInterval", "18000")
        .config("spark.network.timeout", "36000")
        .config("spark.sql.broadcastTimeout", "1800")
        .getOrCreate();
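Encoders.bean, used in the next step, requires the data classes to follow JavaBean conventions: a public no-arg constructor plus getters and setters for every field. The real fields of ItemFocusDataMapItemGd are not shown in this post, so the following is a minimal sketch with hypothetical fields (itemId, focusDate) purely to illustrate the shape Encoders.bean expects:

import java.io.Serializable;

// Hypothetical sketch: the real fields of ItemFocusDataMapItemGd are not shown here
public class ItemFocusDataMapItemGd implements Serializable {
    private String itemId;    // assumed field
    private String focusDate; // assumed field

    public ItemFocusDataMapItemGd() {} // no-arg constructor required by Encoders.bean

    public String getItemId() { return itemId; }
    public void setItemId(String itemId) { this.itemId = itemId; }
    public String getFocusDate() { return focusDate; }
    public void setFocusDate(String focusDate) { this.focusDate = focusDate; }
}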

// Convert the query result into a typed Dataset via a bean Encoder
String reduplicatedListSql = GetExecuteSql.getFocusDataMapAddItemGdResultList(yesterday);
Encoder<ItemFocusDataMapItemGd> itemFocusDataMapItemGdEncoder = Encoders.bean(ItemFocusDataMapItemGd.class);
Dataset<ItemFocusDataMapItemGd> itemMixedEtlResultDataset =
        spark.sql(reduplicatedListSql).as(itemFocusDataMapItemGdEncoder);
Encoder<ItemMixedEtlResultTmp> itemMixedEtlResultTmpEncoder = Encoders.bean(ItemMixedEtlResultTmp.class);
// Repartition, then transform each partition with the custom MapPartitionsFunction
Dataset<ItemMixedEtlResultTmp> itemMixedEtlDuplicatedResultDataset = itemMixedEtlResultDataset
        .repartition(repartitions)
        .mapPartitions(new MapPartitionFocusDataMapAddItem(), itemMixedEtlResultTmpEncoder);
itemMixedEtlDuplicatedResultDataset.createOrReplaceTempView("result_data");
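MapPartitionFocusDataMapAddItem itself is not shown in this post. Below is a minimal sketch of a MapPartitionsFunction that deduplicates rows within each partition, assuming the hypothetical fields from the bean sketch above and an assumed toTmp field-mapping helper; the real transformation logic may differ:

import org.apache.spark.api.java.function.MapPartitionsFunction;
import java.util.Iterator;
import java.util.LinkedHashMap;
import java.util.Map;

// Hypothetical sketch; the real per-partition logic is not shown in the post
public class MapPartitionFocusDataMapAddItem
        implements MapPartitionsFunction<ItemFocusDataMapItemGd, ItemMixedEtlResultTmp> {
    @Override
    public Iterator<ItemMixedEtlResultTmp> call(Iterator<ItemFocusDataMapItemGd> input) {
        Map<String, ItemMixedEtlResultTmp> seen = new LinkedHashMap<>();
        while (input.hasNext()) {
            ItemFocusDataMapItemGd row = input.next();
            // Keep the first occurrence of each itemId within the partition (assumed key)
            seen.putIfAbsent(row.getItemId(), toTmp(row));
        }
        return seen.values().iterator();
    }

    private ItemMixedEtlResultTmp toTmp(ItemFocusDataMapItemGd row) {
        ItemMixedEtlResultTmp tmp = new ItemMixedEtlResultTmp();
        tmp.setItemId(row.getItemId()); // hypothetical field mapping
        return tmp;
    }
}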
spark.sqlContext().setConf("hive.merge.mapfiles","true");
spark.sqlContext().setConf("hive.merge.mapredfiles","true");
spark.sqlContext().setConf("hive.merge.size.per.task","167772160");
spark.sqlContext().setConf("hive.merge.smallfiles.avgsize","125829120");
spark.sqlContext().setConf("hive.exec.dynamic.partition", "true");
spark.sqlContext().setConf("hive.exec.dynamic.partition.mode", "nonstrict");
// Deduplicate the generated temp view and insert the result into the target table
spark.sql(GetExecuteSql.getInsertFocusDataMapAddItemGdSql(source));
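The insert statement returned by GetExecuteSql.getInsertFocusDataMapAddItemGdSql is likewise not shown. Below is a hypothetical sketch of what a deduplicating dynamic-partition insert over the result_data view could look like; the target table name, column names, and dt partition column are all assumptions:

// Hypothetical sketch only; table, columns, and partition key are assumptions
String insertSql =
        "INSERT OVERWRITE TABLE item_focus_data_map_gd PARTITION (dt) " +
        "SELECT itemId, focusDate, dt FROM ( " +
        "  SELECT *, ROW_NUMBER() OVER (PARTITION BY itemId ORDER BY focusDate DESC) AS rn " +
        "  FROM result_data " +
        ") t WHERE rn = 1";
spark.sql(insertSql);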

