Spark 中使用到的工具
1)sc.textFile 读取小文件时,调用的是 hadoopFile,使用的是 TextInputFormat;处理大量小文件时,除了使用 sc.wholeTextFiles,还可以使用 CombineFileInputFormat 的具体实现(如 CombineTextInputFormat)或自定义其子类
// sc.textFile calls sc.hadoopFile with a fixed InputFormat (TextInputFormat).
// Calling sc.hadoopFile directly lets us pass a combining InputFormat instead, which addresses
// the "large number of small files" problem.
// NOTE: CombineFileInputFormat itself is abstract and cannot be instantiated by Spark, so the
// concrete text implementation from the old `mapred` API (the API sc.hadoopFile expects) is used.
val rdd2 = sc.hadoopFile(
  "path",
  classOf[org.apache.hadoop.mapred.lib.CombineTextInputFormat],
  classOf[LongWritable],
  classOf[Text],
  2 // minPartitions
)
2)ip2region:使用的是开源的ip2region.db
package com.bangdao.bigdata.utils; import java.io.*; /** * @author Shydow * @date 2021-12-29 */ public class FileUtil { public static byte[] readFileToByteArray(String path) throws IOException { File file = new File(path); FileInputStream fis = new FileInputStream(file); ByteArrayOutputStream bos = new ByteArrayOutputStream(); byte[] b = new byte[1024]; int len = -1; while ((len = fis.read(b)) != -1) { bos.write(b, 0, len); } return bos.toByteArray(); } }
// 在 Spark SQL 中使用(注册为 UDF)
// Locate the bundled ip2region database on the classpath.
val url: URL = GeoHashAndIpRegion.getClass.getClassLoader.getResource("ip2region.db")
require(url != null, "ip2region.db not found on the classpath")

// Read through the URL's stream rather than url.getPath: when the resource is packaged inside a
// jar (or the path contains URL-encoded characters) getPath is not a usable file-system path.
val bytes: Array[Byte] = {
  val in = url.openStream()
  try {
    val out = new java.io.ByteArrayOutputStream()
    val buf = new Array[Byte](8192)
    Iterator.continually(in.read(buf)).takeWhile(_ != -1).foreach(n => out.write(buf, 0, n))
    out.toByteArray
  } finally in.close()
}

// UDF body: resolves an IP string to its region using an in-memory search over `bytes`.
// Returns null (SQL NULL) for a null input or when the lookup yields no data block.
val func: String => String = (ip: String) => {
  if (ip == null) null
  else {
    val config = new DbConfig()
    val searcher = new DbSearcher(config, bytes)
    // NOTE(review): memorySearch appears to be able to return null for unmatched addresses;
    // wrapping in Option avoids an NPE on .getRegion either way.
    Option(searcher.memorySearch(ip)).map(_.getRegion).orNull
  }
}

spark.udf.register("ip2region", func)
// Using ip2region from a Spark RDD.
// Load the database once on the driver and broadcast it to the executors.
val bytes: Array[Byte] = Files.readAllBytes(Paths.get("ip2region.db"))
val bc2: Broadcast[Array[Byte]] = sc.broadcast(bytes)

source.rdd.mapPartitions { rows =>
  // Build the IP searcher once per partition instead of once per row.
  val searcher = new DbSearcher(new DbConfig(), bc2.value)
  rows.map { row =>
    // query
    val ip: String = row.getAs[String]("ip")
    // A null ip, or a lookup that yields no data block, produces a null region rather than an NPE.
    val region: String =
      if (ip == null) null
      else Option(searcher.memorySearch(ip)).map(_.getRegion).orNull
    (row, region)
  }
}.foreach(println)
3)geohash2region
// geohash 地址转换:先把经纬度编码为 geohash,再用 geohash 查字典得到省、市
// Build the geohash -> Area dictionary from the CSV and collect it to the driver.
// Column 8 is used as the geohash key; columns 2-5 and 8 are passed to Area.
val map: collection.Map[String, Area] = sc.textFile("area_with_geohash.csv")
  // limit = -1 keeps trailing empty fields, which a plain split(",") silently drops
  .map(_.split(",", -1))
  // skip short/malformed rows instead of failing the job with ArrayIndexOutOfBoundsException
  .filter(_.length > 8)
  .map(arr => (arr(8), Area(arr(2), arr(3), arr(4), arr(5), arr(8))))
  .collectAsMap()

// Broadcast the dictionary so each executor receives a single read-only copy.
val bc: Broadcast[collection.Map[String, Area]] = sc.broadcast(map)
// For every row, encode (lat, lon) as a 5-character geohash and look up province/city in the
// broadcast dictionary. Emits (uuid, province, city); province and city are null when the
// coordinates are missing or the geohash is not in the dictionary.
source.rdd
  .map { row =>
    val dict: collection.Map[String, Area] = bc.value
    val lonIdx: Int = row.fieldIndex("lon")
    val latIdx: Int = row.fieldIndex("lat")

    val maybeArea: Option[Area] =
      // getAs[Double] on a null column unboxes to 0.0, which would silently resolve to the
      // geohash of (0, 0); treat missing coordinates as "no area" instead.
      if (row.isNullAt(lonIdx) || row.isNullAt(latIdx)) None
      else {
        val lon: Double = row.getAs[Double]("lon")
        val lat: Double = row.getAs[Double]("lat")
        // build the geohash
        val geohash: String = GeoHash.geoHashStringWithCharacterPrecision(lat, lon, 5)
        dict.get(geohash)
      }

    val province: String = maybeArea.map(_.province).orNull
    val city: String = maybeArea.map(_.city).orNull
    (row.getAs[String]("uuid"), province, city)
  }