基于Java使用Flink批处理读取CSV文件,分别通过Table API(SQL)和DataSet的join方法两种方式实现多表联合的数据处理,再将结果写入CSV文件

Maven依赖

源头

<dependencies>
	<!-- Lombok is an annotation processor: it is only needed while compiling,
	     so it is declared with provided scope and kept out of the packaged job. -->
	<dependency>
		<groupId>org.projectlombok</groupId>
		<artifactId>lombok</artifactId>
		<version>1.18.8</version>
		<scope>provided</scope>
	</dependency>

	<!-- All Flink artifacts below are pinned to the same release (1.8.0). -->
	<dependency>
		<groupId>org.apache.flink</groupId>
		<artifactId>flink-table-planner_2.11</artifactId>
		<version>1.8.0</version>
	</dependency>

	<dependency>
		<groupId>org.apache.flink</groupId>
		<artifactId>flink-table-api-java-bridge_2.11</artifactId>
		<version>1.8.0</version>
	</dependency>

	<dependency>
		<groupId>org.apache.flink</groupId>
		<artifactId>flink-streaming-scala_2.11</artifactId>
		<version>1.8.0</version>
	</dependency>

	<dependency>
		<groupId>org.apache.flink</groupId>
		<artifactId>flink-table-common</artifactId>
		<version>1.8.0</version>
	</dependency>
</dependencies>

改版

    <dependencies>
        <!-- Lombok is an annotation processor: it is only needed while compiling,
             so it is declared with provided scope and kept out of the packaged job. -->
        <dependency>
            <groupId>org.projectlombok</groupId>
            <artifactId>lombok</artifactId>
            <version>1.18.8</version>
            <scope>provided</scope>
        </dependency>

        <dependency>
            <groupId>org.apache.flink</groupId>
            <artifactId>flink-table_2.11</artifactId>
            <version>1.7.2</version>
        </dependency>

        <!-- Kept on the same Flink release as flink-table_2.11 so that artifacts
             from different Flink versions are not mixed on one classpath. -->
        <dependency>
            <groupId>org.apache.flink</groupId>
            <artifactId>flink-streaming-scala_2.11</artifactId>
            <version>1.7.2</version>
        </dependency>
    </dependencies>

SQL语句

-- Row counts of the five source tables. The trailing number on each statement is
-- the count noted by the author; the line below it lists that table's columns.
SELECT COUNT(*)
  FROM T13_REF_AIRPORT_SAT;        -- 11008
-- columns: HUB_ID, IATA_CD, NAME_CN, NAME_EN
SELECT COUNT(*)
  FROM T13_REF_AIRPORT_CITY_LINK;  -- 9676
-- columns: LINK_ID (flagged with asterisks in the original note, presumably as unused), AIRPORT_HUB_ID, CITY_HUB_ID
SELECT COUNT(*)
  FROM T13_REF_CITY_SAT;           -- 9624
-- columns: HUB_ID, CITY_CD, NAME_CN, NAME_EN
SELECT COUNT(*)
  FROM T13_REF_CITY_COUNTRY_LINK;  -- 9062
-- columns: LINK_ID (flagged with asterisks in the original note, presumably as unused), COUNTRY_HUB_ID, CITY_HUB_ID
SELECT COUNT(*)
  FROM T13_REF_COUNTRY_SAT;        -- 356
-- columns: HUB_ID, COUNTRY_CD, NAME_CN, NAME_EN

-- Full airport -> city -> country join, every column of every table.
SELECT *
  FROM T13_REF_AIRPORT_SAT X1
 INNER JOIN T13_REF_AIRPORT_CITY_LINK X2 ON X1.HUB_ID = X2.AIRPORT_HUB_ID
 INNER JOIN T13_REF_CITY_SAT          X3 ON X2.CITY_HUB_ID = X3.HUB_ID
 INNER JOIN T13_REF_CITY_COUNTRY_LINK X4 ON X3.HUB_ID = X4.CITY_HUB_ID
 INNER JOIN T13_REF_COUNTRY_SAT       X5 ON X4.COUNTRY_HUB_ID = X5.HUB_ID;

-- Number of rows produced by the full join.
SELECT COUNT(*)
  FROM T13_REF_AIRPORT_SAT X1
 INNER JOIN T13_REF_AIRPORT_CITY_LINK X2 ON X1.HUB_ID = X2.AIRPORT_HUB_ID
 INNER JOIN T13_REF_CITY_SAT          X3 ON X2.CITY_HUB_ID = X3.HUB_ID
 INNER JOIN T13_REF_CITY_COUNTRY_LINK X4 ON X3.HUB_ID = X4.CITY_HUB_ID
 INNER JOIN T13_REF_COUNTRY_SAT       X5 ON X4.COUNTRY_HUB_ID = X5.HUB_ID;  -- 16759

-- Airport count per country (grouped by the country's Chinese name), largest first.
SELECT X5.NAME_CN       AS COUNTRY_CN_NAME,
       COUNT(X1.HUB_ID) AS COUNT_AIRPORT
  FROM T13_REF_AIRPORT_SAT X1
 INNER JOIN T13_REF_AIRPORT_CITY_LINK X2 ON X1.HUB_ID = X2.AIRPORT_HUB_ID
 INNER JOIN T13_REF_CITY_SAT          X3 ON X2.CITY_HUB_ID = X3.HUB_ID
 INNER JOIN T13_REF_CITY_COUNTRY_LINK X4 ON X3.HUB_ID = X4.CITY_HUB_ID
 INNER JOIN T13_REF_COUNTRY_SAT       X5 ON X4.COUNTRY_HUB_ID = X5.HUB_ID
 GROUP BY X5.NAME_CN
 ORDER BY COUNT_AIRPORT DESC;  -- 254

-- Airport count per (country, city) combination, largest first.
SELECT X5.COUNTRY_CD,
       X5.NAME_CN       AS COUNTRY_NAME_CN,
       X5.NAME_EN       AS COUNTRY_NAME_EN,
       X3.CITY_CD,
       X3.NAME_CN       AS CITY_CN_NAME,
       X3.NAME_EN       AS CITY_EN_NAME,
       COUNT(X1.HUB_ID) AS COUNT_AIRPORT
  FROM T13_REF_AIRPORT_SAT X1
 INNER JOIN T13_REF_AIRPORT_CITY_LINK X2 ON X1.HUB_ID = X2.AIRPORT_HUB_ID
 INNER JOIN T13_REF_CITY_SAT          X3 ON X2.CITY_HUB_ID = X3.HUB_ID
 INNER JOIN T13_REF_CITY_COUNTRY_LINK X4 ON X3.HUB_ID = X4.CITY_HUB_ID
 INNER JOIN T13_REF_COUNTRY_SAT       X5 ON X4.COUNTRY_HUB_ID = X5.HUB_ID
 GROUP BY X5.COUNTRY_CD, X5.NAME_CN, X5.NAME_EN, X3.CITY_CD, X3.NAME_CN, X3.NAME_EN
 ORDER BY COUNT_AIRPORT DESC;  -- 13030

-- Same per-city aggregation, restricted to cities without an English name.
SELECT X5.COUNTRY_CD,
       X5.NAME_CN       AS COUNTRY_NAME_CN,
       X5.NAME_EN       AS COUNTRY_NAME_EN,
       X3.CITY_CD,
       X3.NAME_CN       AS CITY_CN_NAME,
       X3.NAME_EN       AS CITY_EN_NAME,
       COUNT(X1.HUB_ID) AS COUNT_AIRPORT
  FROM T13_REF_AIRPORT_SAT X1
 INNER JOIN T13_REF_AIRPORT_CITY_LINK X2 ON X1.HUB_ID = X2.AIRPORT_HUB_ID
 INNER JOIN T13_REF_CITY_SAT          X3 ON X2.CITY_HUB_ID = X3.HUB_ID
 INNER JOIN T13_REF_CITY_COUNTRY_LINK X4 ON X3.HUB_ID = X4.CITY_HUB_ID
 INNER JOIN T13_REF_COUNTRY_SAT       X5 ON X4.COUNTRY_HUB_ID = X5.HUB_ID
 WHERE X3.NAME_EN IS NULL
 GROUP BY X5.COUNTRY_CD, X5.NAME_CN, X5.NAME_EN, X3.CITY_CD, X3.NAME_CN, X3.NAME_EN
 ORDER BY COUNT_AIRPORT DESC;

-- Author's NULL tallies per output column (presumably obtained by changing the
-- IS NULL filter column in the query above; verify before relying on them):
--   COUNTRY_NAME_EN is NULL: 19
--   CITY_CN_NAME    is NULL: 1
--   CITY_EN_NAME    is NULL: 1501

Airport_Sat

import lombok.Data;

/**
 * POJO for one row of the airport CSV (T13_REF_AIRPORT_SAT.csv) as read by FlinkCsv.
 *
 * <p>Only the {@code hub_id} column is mapped. The snake_case field name is
 * intentional: FlinkCsv passes it as a string to {@code pojoType(...)} and uses
 * it as a join key, so renaming the field would break those references.
 * Accessors, {@code equals}/{@code hashCode} and {@code toString} are generated
 * by Lombok's {@code @Data}.
 */
@Data
public class AirportSat
{
    /** Airport identifier; joined against {@code AirportCityLink.airport_hub_id}. */
    private String hub_id;
}

Airport_City_Link

import lombok.Data;

/**
 * POJO for one row of the airport-to-city link CSV (T13_REF_AIRPORT_CITY_LINK.csv)
 * as read by FlinkCsv.
 *
 * <p>The snake_case field names are intentional: FlinkCsv passes them as strings
 * to {@code pojoType(...)} and calls the Lombok-generated getter
 * {@code getCity_hub_id()}, so renaming a field would break those references.
 */
@Data
public class AirportCityLink
{
    /** Airport side of the link; joined against {@code AirportSat.hub_id}. */
    private String airport_hub_id;
    /** City side of the link; joined against {@code CitySat.hub_id}. */
    private String city_hub_id;
}

City_Sat

import lombok.Data;

/**
 * POJO for one row of the city CSV (T13_REF_CITY_SAT.csv) as read by FlinkCsv.
 *
 * <p>The snake_case field names are intentional: FlinkCsv passes them as strings
 * to {@code pojoType(...)} and calls the Lombok-generated getters
 * (e.g. {@code getHub_id()}, {@code getCity_cd()}), so renaming a field would
 * break those references.
 */
@Data
public class CitySat
{
    /** City identifier; joined against the {@code city_hub_id} of both link tables. */
    private String hub_id;
    /** City code. */
    private String city_cd;
    /** City name (NAME_CN column). */
    private String name_cn;
    /** City name (NAME_EN column). */
    private String name_en;
}

City_Country_Link

import lombok.Data;

/**
 * POJO for one row of the city-to-country link CSV (T13_REF_CITY_COUNTRY_LINK.csv)
 * as read by FlinkCsv.
 *
 * <p>The snake_case field names are intentional: FlinkCsv passes them as strings
 * to {@code pojoType(...)} and calls the Lombok-generated getter
 * {@code getCountry_hub_id()}, so renaming a field would break those references.
 */
@Data
public class CityCountryLink
{
    /** Country side of the link; joined against {@code CountrySat.hub_id}. */
    private String country_hub_id;
    /** City side of the link; joined against {@code CitySat.hub_id}. */
    private String city_hub_id;
}

Country_Sat

import lombok.Data;

/**
 * POJO for one row of the country CSV (T13_REF_COUNTRY_SAT.csv) as read by FlinkCsv.
 *
 * <p>The snake_case field names are intentional: FlinkCsv passes them as strings
 * to {@code pojoType(...)} and calls the Lombok-generated getters
 * (e.g. {@code getCountry_cd()}, {@code getName_cn()}), so renaming a field would
 * break those references.
 */
@Data
public class CountrySat
{
    /** Country identifier; joined against {@code CityCountryLink.country_hub_id}. */
    private String hub_id;
    /** Country code. */
    private String country_cd;
    /** Country name (NAME_CN column). */
    private String name_cn;
    /** Country name (NAME_EN column). */
    private String name_en;
}

Flink_Csv

点击查看Flink_Csv代码
import org.apache.flink.api.common.functions.MapFunction;
import org.apache.flink.api.common.operators.Order;
import org.apache.flink.api.common.typeinfo.TypeHint;
import org.apache.flink.api.common.typeinfo.TypeInformation;
import org.apache.flink.api.java.DataSet;
import org.apache.flink.api.java.ExecutionEnvironment;
import org.apache.flink.api.java.functions.KeySelector;
import org.apache.flink.api.java.operators.MapOperator;
import org.apache.flink.api.java.operators.SortPartitionOperator;
import org.apache.flink.api.java.tuple.Tuple1;
import org.apache.flink.api.java.tuple.Tuple2;
import org.apache.flink.api.java.tuple.Tuple7;
import org.apache.flink.core.fs.FileSystem;
import org.apache.flink.table.api.Table;
import org.apache.flink.table.api.java.BatchTableEnvironment;

import java.text.SimpleDateFormat;
import java.util.Date;

public class FlinkCsv
{
    public static void main(String[] args) throws Exception
    {
		long s4 = System.currentTimeMillis();
		t4();
		System.out.println((System.currentTimeMillis() - s4) + "u");
		long s5 = System.currentTimeMillis();
		t5();
		System.out.println((System.currentTimeMillis() - s5) + "d");
    }

    private static void t5() throws Exception
    {
        ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();
        env.setParallelism(1);
        BatchTableEnvironment table_env = BatchTableEnvironment.getTableEnvironment(env);
        SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH-mm-ss SSS");

        DataSet<AirportSat> data_airportsat = env.readCsvFile("D:\\T13_REF_AIRPORT_SAT.csv")
                .fieldDelimiter(",").ignoreFirstLine().includeFields(true/*, true, false, true, true*/)
                .pojoType(AirportSat.class, "hub_id"/*, "iata_cd", "name_cn", "name_en"*/);

        DataSet<AirportCityLink> data_airportcitylink = env.readCsvFile("D:\\T13_REF_AIRPORT_CITY_LINK.csv")
                .fieldDelimiter(",").ignoreFirstLine().includeFields(false, true, true)
                .pojoType(AirportCityLink.class, "airport_hub_id", "city_hub_id");

        DataSet<CitySat> data_citysat = env.readCsvFile("D:\\T13_REF_CITY_SAT.csv")
                .fieldDelimiter(",").ignoreFirstLine().includeFields(true, true, true, true)
                .pojoType(CitySat.class, "hub_id", "city_cd", "name_cn", "name_en");

        DataSet<CityCountryLink> data_citycountrylink = env.readCsvFile("D:\\T13_REF_CITY_COUNTRY_LINK.csv")
                .fieldDelimiter(",").ignoreFirstLine().includeFields(false, true, true)
                .pojoType(CityCountryLink.class, "country_hub_id", "city_hub_id");

        DataSet<CountrySat> data_countrysat = env.readCsvFile("D:\\T13_REF_COUNTRY_SAT.csv")
                .fieldDelimiter(",").ignoreFirstLine().includeFields(true, true, false, false, true, true)
                .pojoType(CountrySat.class, "hub_id", "country_cd", "name_cn", "name_en");

        table_env.registerTable("t13_ref_airport_sat", table_env.fromDataSet(data_airportsat));
        table_env.registerTable("t13_ref_airport_city_link", table_env.fromDataSet(data_airportcitylink));
        table_env.registerTable("t13_ref_city_sat", table_env.fromDataSet(data_citysat));
        table_env.registerTable("t13_ref_city_country_link", table_env.fromDataSet(data_citycountrylink));
        table_env.registerTable("t13_ref_country_sat", table_env.fromDataSet(data_countrysat));


        String sql = "select count(*) \n" +
                "\tfrom t13_ref_airport_sat x1,t13_ref_airport_city_link x2,\n" +
                "\tt13_ref_city_sat x3,t13_ref_city_country_link x4,t13_ref_country_sat x5\n" +
                "\twhere x1.hub_id=x2.airport_hub_id\n" +
                "\t\tand x2.city_hub_id=x3.hub_id\n" +
                "\t\tand x3.hub_id=x4.city_hub_id\n" +
                "\t\tand x4.country_hub_id=x5.hub_id";

        String sql_country = "select x5.name_cn country_cn_name,count(x1.hub_id) count_airport\n" +
                "\tfrom t13_ref_airport_sat x1,t13_ref_airport_city_link x2,\n" +
                "\tt13_ref_city_sat x3,t13_ref_city_country_link x4,t13_ref_country_sat x5\n" +
                "\twhere x1.hub_id=x2.airport_hub_id\n" +
                "\t\tand x2.city_hub_id=x3.hub_id\n" +
                "\t\tand x3.hub_id=x4.city_hub_id\n" +
                "\t\tand x4.country_hub_id=x5.hub_id\n" +
                "\tgroup by x5.name_cn\n" +
                "\torder by count_airport desc";

        String sql_all = "select \n" +
                "\tx5.country_cd,\n" +
                "\tx5.name_cn country_name_cn,\n" +
                "\tx5.name_en country_name_en,\n" +
                "\tx3.city_cd,\n" +
                "\tx3.name_cn city_cn_name,\n" +
                "\tx3.name_en city_en_name,\n" +
                "count(x1.hub_id) count_airport\n" +
                "\tfrom t13_ref_airport_sat x1,t13_ref_airport_city_link x2,t13_ref_city_sat x3,t13_ref_city_country_link x4,t13_ref_country_sat x5\n" +
                "\twhere x1.hub_id=x2.airport_hub_id\n" +
                "\t\tand x2.city_hub_id=x3.hub_id\n" +
                "\t\tand x3.hub_id=x4.city_hub_id\n" +
                "\t\tand x4.country_hub_id=x5.hub_id\n" +
                "\tgroup by x5.country_cd,x5.name_cn,x5.name_en,x3.city_cd,x3.name_cn,x3.name_en\n" +
                "\torder by count_airport desc";


        DataSet<Tuple1<Long>> map = table_env.toDataSet(table_env.sqlQuery(sql),
                TypeInformation.of(new TypeHint<Tuple1<Long>>()
                {
                }));
        map.print();

        DataSet<Tuple2<String, Long>> map_country = table_env.toDataSet(table_env.sqlQuery(sql_country),
                TypeInformation.of(new TypeHint<Tuple2<String, Long>>()
                {
                }));
        System.out.println(map_country.count());
        map_country.print();

        Table result_country = table_env.sqlQuery(sql_country);
        DataSet<Tuple7<String, String, String, String, String, String, Long>> map_all = table_env.toDataSet(table_env.sqlQuery(sql_all),
                TypeInformation.of(new TypeHint<Tuple7<String, String, String, String, String, String, Long>>()
                {
                }));
        System.out.println(map_all.count());
        map_all.print();

        map.writeAsCsv("D:\\Flink_CSV\\" + sdf.format(new Date()) + "______map.csv",
                "\n", ",", FileSystem.WriteMode.OVERWRITE).setParallelism(1);
        System.out.println("T打印完成______map...");
        map_country.writeAsCsv("D:\\Flink_CSV\\" + sdf.format(new Date()) + "______map_country.csv",
                "\n", ",", FileSystem.WriteMode.OVERWRITE).setParallelism(1);
        System.out.println("T打印完成______map_country...");
        map_all.writeAsCsv("D:\\Flink_CSV\\" + sdf.format(new Date()) + "______map_all.csv",
                "\n", ",", FileSystem.WriteMode.OVERWRITE).setParallelism(1);
        System.out.println("T打印完成______map_all...");

        env.execute("Hello!@ Fuck...");
    }

    private static void t4() throws Exception
    {
        ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();
        env.setParallelism(1);
        SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH-mm-ss SSS");

        DataSet<AirportSat> data_airportsat = env.readCsvFile("D:\\T13_REF_AIRPORT_SAT.csv")
                .fieldDelimiter(",").ignoreFirstLine().includeFields(true/*, true, false, true, true*/)
                .pojoType(AirportSat.class, "hub_id"/*, "iata_cd", "name_cn", "name_en"*/);

        DataSet<AirportCityLink> data_airportcitylink = env.readCsvFile("D:\\T13_REF_AIRPORT_CITY_LINK.csv")
                .fieldDelimiter(",").ignoreFirstLine().includeFields(false, true, true)
                .pojoType(AirportCityLink.class, "airport_hub_id", "city_hub_id");

        DataSet<CitySat> data_citysat = env.readCsvFile("D:\\T13_REF_CITY_SAT.csv")
                .fieldDelimiter(",").ignoreFirstLine().includeFields(true, true, true, true)
                .pojoType(CitySat.class, "hub_id", "city_cd", "name_cn", "name_en");

        DataSet<CityCountryLink> data_citycountrylink = env.readCsvFile("D:\\T13_REF_CITY_COUNTRY_LINK.csv")
                .fieldDelimiter(",").ignoreFirstLine().includeFields(false, true, true)
                .pojoType(CityCountryLink.class, "country_hub_id", "city_hub_id");

        DataSet<CountrySat> data_countrysat = env.readCsvFile("D:\\T13_REF_COUNTRY_SAT.csv")
                .fieldDelimiter(",").ignoreFirstLine().includeFields(true, true, false, false, true, true)
                .pojoType(CountrySat.class, "hub_id", "country_cd", "name_cn", "name_en");

        MapOperator<Tuple2<Tuple2<Tuple2<Tuple2<AirportSat, AirportCityLink>, CitySat>, CityCountryLink>, CountrySat>,
                Tuple7<String, String, String, String, String, String, Long>> map = data_airportsat
                .join(data_airportcitylink).where("hub_id").equalTo("airport_hub_id")
                .join(data_citysat).where(new KeySelector<Tuple2<AirportSat, AirportCityLink>, String>()
                {
                    @Override
                    public String getKey(Tuple2<AirportSat, AirportCityLink> t) throws Exception
                    {
                        return t.f1.getCity_hub_id();
                    }
                }).equalTo("hub_id")
                .join(data_citycountrylink).where(new KeySelector<Tuple2<Tuple2<AirportSat, AirportCityLink>, CitySat>, String>()
                {
                    @Override
                    public String getKey(Tuple2<Tuple2<AirportSat, AirportCityLink>, CitySat> t) throws Exception
                    {
                        return t.f1.getHub_id();
                    }
                }).equalTo("city_hub_id")
                .join(data_countrysat).where(new KeySelector<Tuple2<Tuple2<Tuple2<AirportSat, AirportCityLink>, CitySat>, CityCountryLink>, String>()
                {
                    @Override
                    public String getKey(Tuple2<Tuple2<Tuple2<AirportSat, AirportCityLink>, CitySat>, CityCountryLink> t) throws Exception
                    {
                        return t.f1.getCountry_hub_id();
                    }
                }).equalTo("hub_id")
                .map(new MapFunction<Tuple2<Tuple2<Tuple2<Tuple2<AirportSat, AirportCityLink>, CitySat>, CityCountryLink>, CountrySat>,
                        Tuple7<String, String, String, String, String, String, Long>>()
                {

                    @Override
                    public Tuple7<String, String, String, String, String, String, Long> map(
                            Tuple2<Tuple2<Tuple2<Tuple2<AirportSat, AirportCityLink>, CitySat>, CityCountryLink>, CountrySat> t) throws Exception
                    {
                        String country_cd = t.f1.getCountry_cd();
                        String country_cn_name = t.f1.getName_cn();
                        String country_en_name = t.f1.getName_en();
                        String city_cd = t.f0.f0.f1.getCity_cd();
                        String city_cn_name = t.f0.f0.f1.getName_cn();
                        String city_en_name = t.f0.f0.f1.getName_en();
                        long airport = 1L;
                        return new Tuple7<>(country_cd, country_cn_name, country_en_name, city_cd, city_cn_name, city_en_name, airport);
                    }
                });
        //--------------------------------------------------------------------------------------------------------------
        System.out.println("总数量: " + map.count());
        SortPartitionOperator<Tuple2<String, Long>> map_country = map
                .map(new MapFunction<Tuple7<String, String, String, String, String, String, Long>, Tuple2<String, Long>>()
                {
                    @Override
                    public Tuple2<String, Long> map(Tuple7<String, String, String, String, String, String, Long> t) throws Exception
                    {
                        return new Tuple2<>(t.f1, t.f6);
                    }
                }).groupBy(0).sum(1).sortPartition(1, Order.DESCENDING);
        System.out.println("国家分总数量: " + map_country.count());
		//map_country.print();
        SortPartitionOperator<Tuple7<String, String, String, String, String, String, Long>> map_all = map
                .groupBy(0, 1, 2, 3, 4, 5).sum(6).sortPartition(6, Order.DESCENDING);
        System.out.println("全分总数量: " + map_all.count());
		//map_all.print();


        map.writeAsCsv("D:\\Flink_CSV\\" + sdf.format(new Date()) + "______map.csv",
                "\n", ",", FileSystem.WriteMode.OVERWRITE).setParallelism(1);
        System.out.println("打印完成______map...");
        map_country.writeAsCsv("D:\\Flink_CSV\\" + sdf.format(new Date()) + "______map_country.csv",
                "\n", ",", FileSystem.WriteMode.OVERWRITE).setParallelism(1);
        System.out.println("打印完成______map_country...");
        map_all.writeAsCsv("D:\\Flink_CSV\\" + sdf.format(new Date()) + "______map_all.csv",
                "\n", ",", FileSystem.WriteMode.OVERWRITE).setParallelism(1);
        System.out.println("打印完成______map_all...");

        env.execute("Hello!@ Fuck...");
    }
}

posted on 2019-11-28 15:18  陶攀峰  阅读(1691)  评论(0)  编辑  收藏  举报

顶部 底部