Big Data Development

import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Encoder;
import org.apache.spark.sql.Encoders;
import org.apache.spark.sql.SparkSession;

// Build a Hive-enabled SparkSession
SparkSession spark = SparkSession.builder()
        .appName("spark-item")
        .config("spark.sql.warehouse.dir", warehouse_location)
        .enableHiveSupport()
        .config("spark.sql.adaptive.enabled", true)          // adaptive query execution
        .config("spark.sql.hive.mergeFiles", true)           // non-standard key, likely vendor-specific
        .config("spark.executor.heartbeatInterval", "18000") // keep well below spark.network.timeout
        .config("spark.network.timeout", "36000")
        .config("spark.sql.broadcastTimeout", "1800")        // broadcast-join timeout, in seconds
        .getOrCreate();
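A quick way to sanity-check that these settings actually took effect is to read them back from the session's runtime config. This is a minimal sketch, not part of the original code:

// Read back SQL configs at runtime to confirm the builder settings were applied
System.out.println(spark.version());
System.out.println(spark.conf().get("spark.sql.adaptive.enabled"));  // expected: true
System.out.println(spark.conf().get("spark.sql.broadcastTimeout"));  // expected: 1800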

// Convert the query result into a typed Dataset via a bean Encoder
String reduplicatedListSql = GetExecuteSql.getFocusDataMapAddItemGdResultList(yesterday);
Encoder<ItemFocusDataMapItemGd> itemFocusDataMapItemGdEncoder = Encoders.bean(ItemFocusDataMapItemGd.class);
Dataset<ItemFocusDataMapItemGd> itemMixedEtlResultDataset =
        spark.sql(reduplicatedListSql).as(itemFocusDataMapItemGdEncoder);
Encoder<ItemMixedEtlResultTmp> itemMixedEtlResultTmpEncoder = Encoders.bean(ItemMixedEtlResultTmp.class);
// Repartition, then transform each partition with a custom MapPartitionsFunction
Dataset<ItemMixedEtlResultTmp> itemMixedEtlDuplicatedResultDataset = itemMixedEtlResultDataset
        .repartition(repartitions)
        .mapPartitions(new MapPartitionFocusDataMapAddItem(), itemMixedEtlResultTmpEncoder);
// Register the result as a temp view for the insert below
itemMixedEtlDuplicatedResultDataset.createOrReplaceTempView("result_data");
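The post doesn't show MapPartitionFocusDataMapAddItem itself. Assuming it is a standard Spark MapPartitionsFunction that turns each ItemFocusDataMapItemGd into an ItemMixedEtlResultTmp, its skeleton would look roughly like this; the field mapping is hypothetical. Note that Encoders.bean also requires both bean classes to be public with a no-argument constructor and standard getters/setters.

import org.apache.spark.api.java.function.MapPartitionsFunction;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;

public class MapPartitionFocusDataMapAddItem
        implements MapPartitionsFunction<ItemFocusDataMapItemGd, ItemMixedEtlResultTmp> {
    @Override
    public Iterator<ItemMixedEtlResultTmp> call(Iterator<ItemFocusDataMapItemGd> rows) throws Exception {
        List<ItemMixedEtlResultTmp> out = new ArrayList<>();
        while (rows.hasNext()) {
            ItemFocusDataMapItemGd row = rows.next();
            ItemMixedEtlResultTmp tmp = new ItemMixedEtlResultTmp();
            // ... copy/derive fields from row into tmp (business logic not shown in the post)
            out.add(tmp);
        }
        return out.iterator();
    }
}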
spark.sqlContext().setConf("hive.merge.mapfiles","true");
spark.sqlContext().setConf("hive.merge.mapredfiles","true");
spark.sqlContext().setConf("hive.merge.size.per.task","167772160");
spark.sqlContext().setConf("hive.merge.smallfiles.avgsize","125829120");
spark.sqlContext().setConf("hive.exec.dynamic.partition", "true");
spark.sqlContext().setConf("hive.exec.dynamic.partition.mode", "nonstrict");
// Deduplicate the temp view and insert the result into the target table
spark.sql(GetExecuteSql.getInsertFocusDataMapAddItemGdSql(source));
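The insert statement returned by getInsertFocusDataMapAddItemGdSql isn't shown either. Given the dedup comment and the dynamic-partition settings above, a plausible shape is a ROW_NUMBER-based dedup over the result_data view; the table, key, and partition column names below are hypothetical:

// Hypothetical shape of the generated insert; real table/column names are not in the post
String insertSql =
        "INSERT OVERWRITE TABLE focus_data_map_add_item_gd PARTITION (dt) " +
        "SELECT item_id, item_name, dt FROM ( " +
        "  SELECT *, row_number() OVER (PARTITION BY item_id ORDER BY update_time DESC) AS rn " +
        "  FROM result_data) t " +
        "WHERE rn = 1";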

