2024.6.12

Spark SQL 对 JSON 和 CSV 文件的读取

/**
 * Demo entry point: reads a CSV file through Spark SQL, writes the same data
 * back out as JSON, and prints it with {@code show()}.
 */
public class SparkSQL05_Source_JSON_1 {
    /**
     * Reads {@code data/user.csv}, writes it as JSON under {@code output},
     * then calls {@code show()} on the loaded dataset.
     *
     * @param args command-line arguments (unused)
     */
    public static void main(String[] args) {

        // Build the environment object.
        //      For structured-data processing Spark wraps its core functionality
        //      and environment in a dedicated session object.
        //      That Spark SQL environment object is normally created with the
        //      builder pattern.
        final SparkSession sparkSession = SparkSession
                .builder()
                .master("local[*]")
                .appName("SparkSQL")
                .getOrCreate();

        try {
            final Dataset<Row> csv = sparkSession.read().csv("data/user.csv");

            csv.write().json("output");
            csv.show();
        } finally {
            // Release resources even if the read, write or show above throws;
            // previously a failure there skipped close() entirely.
            sparkSession.close();
        }

    }
}

UDF UDAF

/**
 * Aggregator that turns the city values of a group into a short remark string:
 * the first two cities (in {@code CityCount}'s natural order) with their integer
 * percentage of all rows, followed by an "其他" (other) entry when more than two
 * distinct cities were seen.
 *
 * <p>Input: one city name per row. Buffer: total row count plus a per-city counter map.
 * Output: the remark string.
 */
public class MyCityRemarkUDAF extends Aggregator<String, MyCityRemarkBuffer, String> {
    /**
     * Creates the initial buffer: a count of zero and an empty city map.
     *
     * @return a fresh, empty buffer
     */
    @Override
    public MyCityRemarkBuffer zero() {
        return new MyCityRemarkBuffer(0L, new HashMap<String, Long>());
    }

    /**
     * Folds one input value into the buffer: increments the total count and
     * the counter of the given city.
     *
     * @param buffer the running buffer (mutated and returned)
     * @param city   the city of the current row
     * @return the same buffer instance, updated
     */
    @Override
    public MyCityRemarkBuffer reduce(MyCityRemarkBuffer buffer, String city) {
        buffer.setCount( buffer.getCount() + 1 );
        final Map<String, Long> cityMap = buffer.getCityMap();
        // Insert 1 for a city seen for the first time, otherwise add 1 to its counter.
        cityMap.merge(city, 1L, Long::sum);
        buffer.setCityMap(cityMap);
        return buffer;
    }

    /**
     * Merges two buffers by adding their total counts and summing their
     * per-city counters key by key.
     *
     * @param b1 the buffer that receives the merged result (mutated and returned)
     * @param b2 the buffer whose contents are added into {@code b1}
     * @return {@code b1} after merging
     */
    @Override
    public MyCityRemarkBuffer merge(MyCityRemarkBuffer b1, MyCityRemarkBuffer b2) {

        b1.setCount( b1.getCount() + b2.getCount() );

        final Map<String, Long> map1 = b1.getCityMap();
        final Map<String, Long> map2 = b2.getCityMap();
        /*
           c1 : { beijing : 10, tianjin : 20, baoding : 30 }
           c2 : { beijing : 40, tianjin : 50, sjz : 60 }
           ------------------------------------------------
           c3 : { beijing : 50, tianjin : 70, baoding : 30, sjz : 60 }

           1. map1 is the accumulation target
           2. iterate over map2
           3. if a key of map2 already exists in map1, add the two values
           4. if a key of map2 does not exist in map1, insert it as-is
         */
        for ( Map.Entry<String, Long> entry : map2.entrySet() ) {
            map1.merge( entry.getKey(), entry.getValue(), Long::sum );
        }
        b1.setCityMap(map1);
        return b1;
    }

    /**
     * Renders the final remark from the buffer.
     *
     * <p>Each listed city is rendered as {@code "<name> <pct>%"} where {@code pct}
     * is {@code count * 100 / total} using integer division. At most the first two
     * cities after sorting are listed; when more than two distinct cities exist an
     * {@code "其他 <rest>%"} segment with the remaining percentage is appended.
     * An empty buffer yields an empty string, and a buffer with a single city yields
     * just that city's segment (both cases previously threw).
     *
     * @param buffer the fully merged buffer
     * @return the remark string, or {@code ""} when nothing was aggregated
     */
    @Override
    public String finish(MyCityRemarkBuffer buffer) {
        final Long total = buffer.getCount();
        final Map<String, Long> cityMap = buffer.getCityMap();

        // Nothing aggregated: avoid get(0) on an empty list and division by zero.
        if ( cityMap == null || cityMap.isEmpty() || total == null || total == 0L ) {
            return "";
        }

        final List<CityCount> ccs = new ArrayList<CityCount>();
        cityMap.forEach(
            (k, v) -> ccs.add( new CityCount(k, v) )
        );
        // Sort by CityCount's natural ordering.
        // NOTE(review): presumably descending by count so the top cities come first -
        // confirm against CityCount.compareTo.
        Collections.sort(ccs);

        final StringBuilder ss = new StringBuilder();
        // List at most two cities; a group may contain fewer than two distinct cities.
        final int shown = Math.min(ccs.size(), 2);
        long shownPercent = 0L;
        for ( int i = 0; i < shown; i++ ) {
            final CityCount cc = ccs.get(i);
            final long pc = cc.getCount() * 100 / total; // e.g. 10 * 100 / 20 => 50
            // NOTE(review): segments are concatenated without a separator, exactly as
            // before - confirm whether a delimiter is wanted.
            ss.append(cc.getCityName()).append(" ").append(pc).append("%");
            shownPercent += pc;
        }

        if ( ccs.size() > 2 ) {
            ss.append("其他 ").append(100 - shownPercent).append("%");
        }

        return ss.toString();
    }

    /**
     * @return a bean encoder for {@code MyCityRemarkBuffer}
     */
    @Override
    public Encoder<MyCityRemarkBuffer> bufferEncoder() {
        return Encoders.bean(MyCityRemarkBuffer.class);
    }

    /**
     * @return the string encoder used for the aggregation result
     */
    @Override
    public Encoder<String> outputEncoder() {
        return Encoders.STRING();
    }
}

posted @ 2024-06-12 23:23  258333  阅读(28)  评论(0)  编辑  收藏  举报