SPARK中使用到的工具

1)sc.textFile读取小文件时,调用的是hadoopFile,使用的是TextInputFormat;除了使用sc.wholeTextFiles,还可以自定义CombineFileInputFormat

// sc.textFile delegates to sc.hadoopFile with the InputFormat hard-wired to TextInputFormat.
// Calling sc.hadoopFile directly lets us choose a combining InputFormat, which packs many
// small files into fewer splits.
// CombineFileInputFormat itself is abstract, so the concrete CombineTextInputFormat
// (old "mapred" API, as required by sc.hadoopFile) is used here.
// NOTE(review): the combined split size is presumably bounded by
// "mapreduce.input.fileinputformat.split.maxsize" — confirm for the Hadoop version in use.
val rdd2 = sc.hadoopFile(
  "path",
  classOf[org.apache.hadoop.mapred.lib.CombineTextInputFormat],
  classOf[LongWritable],
  classOf[Text],
  2
)

 

2)ip2region:使用的是开源的ip2region.db

package com.bangdao.bigdata.utils;

import java.io.*;

/**
 * @author Shydow
 * @date 2021-12-29
 */
public class FileUtil {

    public static byte[] readFileToByteArray(String path) throws IOException {
        File file = new File(path);
        FileInputStream fis = new FileInputStream(file);
        ByteArrayOutputStream bos = new ByteArrayOutputStream();
        byte[] b = new byte[1024];
        int len = -1;
        while ((len = fis.read(b)) != -1) {
            bos.write(b, 0, len);
        }
        return bos.toByteArray();
    }

}
// Spark SQL: register a UDF that resolves an IP address to its region.
// getResource returns null when the resource is absent, so fail with a clear message
// instead of a NullPointerException further down.
val url: URL = Option(GeoHashAndIpRegion.getClass.getClassLoader.getResource("ip2region.db"))
  .getOrElse(throw new IllegalStateException("ip2region.db not found on the classpath"))

// Read through the URL's stream rather than url.getPath: the path component is
// percent-encoded and is not a real file-system path when the resource sits inside a jar.
val bytes: Array[Byte] = {
  val in = url.openStream()
  try {
    val out = new java.io.ByteArrayOutputStream()
    val buffer = new Array[Byte](8192)
    // read() returns -1 at end of stream
    Iterator.continually(in.read(buffer)).takeWhile(_ != -1).foreach(n => out.write(buffer, 0, n))
    out.toByteArray
  } finally in.close()
}

// The function only closes over the byte array; the searcher is created inside the call.
val func: String => String = (ip: String) => {
  val config = new DbConfig()
  val searcher = new DbSearcher(config, bytes)
  searcher.memorySearch(ip).getRegion
}

spark.udf.register("ip2region", func)
// Spark RDD: resolve the region of every row's IP address.
// Load the ip2region database on the driver and broadcast it to the executors.
val bytes: Array[Byte] = Files.readAllBytes(Paths.get("ip2region.db"))
val bc2: Broadcast[Array[Byte]] = sc.broadcast(bytes)

source.rdd.mapPartitions { rows =>
  // Build the IP searcher once per partition instead of once per row.
  val searcher = new DbSearcher(new DbConfig(), bc2.value)
  rows.map { row =>
    // query
    val ip: String = row.getAs[String]("ip")
    val region: String = searcher.memorySearch(ip).getRegion
    (row, region)
  }
}.foreach(println)

 

3)geohash2region

// Convert (lat, lon) coordinates to province/city through a geohash dictionary.
// Build the dictionary on the driver: key is column 8 of each CSV line (the geohash),
// value is an Area built from columns 2, 3, 4, 5 and 8.
val map: collection.Map[String, Area] = sc.textFile("area_with_geohash.csv")
  .map { line =>
    val arr: Array[String] = line.split(",")
    (arr(8), Area(arr(2), arr(3), arr(4), arr(5), arr(8)))
  }
  .collectAsMap()
val bc: Broadcast[collection.Map[String, Area]] = sc.broadcast(map)

source.rdd.map { row =>
  val lon: Double = row.getAs[Double]("lon")
  val lat: Double = row.getAs[Double]("lat")
  // Encode the coordinates as a 5-character geohash
  val geohash: String = GeoHash.geoHashStringWithCharacterPrecision(lat, lon, 5)
  val maybeArea: Option[Area] = bc.value.get(geohash)
  // province and city stay null when the geohash is not in the dictionary
  val province: String = maybeArea.map(_.province).orNull
  val city: String = maybeArea.map(_.city).orNull
  (row.getAs[String]("uuid"), province, city)
}

 

posted @ 2021-12-23 10:19  Shydow  阅读(199)  评论(0)  编辑  收藏  举报