I. Dependencies
<!--
Licensed to the Apache Software Foundation (ASF) under one or more
contributor license agreements. See the NOTICE file distributed with
this work for additional information regarding copyright ownership.
The ASF licenses this file to you under the Apache License, Version 2.0
(the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at

  http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
-->
<project xmlns="http://maven.apache.org/POM/4.0.0"
         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
  <modelVersion>4.0.0</modelVersion>

  <groupId>com.dajiangtai</groupId>
  <artifactId>learning-flink</artifactId>
  <version>1.0</version>
  <packaging>jar</packaging>

  <name>Flink Quickstart Job</name>
  <url>http://www.myorganization.org</url>

  <properties>
    <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
    <flink.version>1.6.2</flink.version>
    <java.version>1.8</java.version>
    <scala.binary.version>2.11</scala.binary.version>
    <maven.compiler.source>${java.version}</maven.compiler.source>
    <maven.compiler.target>${java.version}</maven.compiler.target>
  </properties>

  <repositories>
    <repository>
      <id>apache.snapshots</id>
      <name>Apache Development Snapshot Repository</name>
      <url>https://repository.apache.org/content/repositories/snapshots/</url>
      <releases>
        <enabled>false</enabled>
      </releases>
      <snapshots>
        <enabled>true</enabled>
      </snapshots>
    </repository>
  </repositories>

  <dependencies>
    <!-- Apache Flink dependencies -->
    <!-- In a production job these should be marked "provided",
         because they should not be packaged into the JAR file. -->
    <dependency>
      <groupId>org.apache.flink</groupId>
      <artifactId>flink-java</artifactId>
      <version>${flink.version}</version>
    </dependency>
    <dependency>
      <groupId>org.apache.flink</groupId>
      <artifactId>flink-streaming-java_${scala.binary.version}</artifactId>
      <version>${flink.version}</version>
    </dependency>
    <dependency>
      <groupId>org.apache.flink</groupId>
      <artifactId>flink-scala_${scala.binary.version}</artifactId>
      <version>${flink.version}</version>
    </dependency>
    <dependency>
      <groupId>org.apache.flink</groupId>
      <artifactId>flink-streaming-scala_${scala.binary.version}</artifactId>
      <version>${flink.version}</version>
    </dependency>
    <!-- https://mvnrepository.com/artifact/org.apache.flink/flink-connector-kafka-0.10 -->
    <dependency>
      <groupId>org.apache.flink</groupId>
      <artifactId>flink-connector-kafka-0.10_${scala.binary.version}</artifactId>
      <version>${flink.version}</version>
    </dependency>
    <!-- https://mvnrepository.com/artifact/org.apache.bahir/flink-connector-redis -->
    <dependency>
      <groupId>org.apache.bahir</groupId>
      <artifactId>flink-connector-redis_2.11</artifactId>
      <version>1.0</version>
    </dependency>
    <dependency>
      <groupId>org.apache.flink</groupId>
      <artifactId>flink-hadoop-compatibility_${scala.binary.version}</artifactId>
      <version>${flink.version}</version>
    </dependency>
    <dependency>
      <groupId>org.apache.flink</groupId>
      <artifactId>flink-hbase_${scala.binary.version}</artifactId>
      <version>${flink.version}</version>
      <exclusions>
        <exclusion>
          <groupId>com.google.protobuf</groupId>
          <artifactId>protobuf-java</artifactId>
        </exclusion>
      </exclusions>
    </dependency>

    <!-- ************************ SQL ************************ -->
    <!-- https://mvnrepository.com/artifact/org.apache.flink/flink-json -->
    <dependency>
      <groupId>org.apache.flink</groupId>
      <artifactId>flink-json</artifactId>
      <version>${flink.version}</version>
    </dependency>
    <!-- https://mvnrepository.com/artifact/org.apache.flink/flink-jdbc -->
    <dependency>
      <groupId>org.apache.flink</groupId>
      <artifactId>flink-jdbc</artifactId>
      <version>${flink.version}</version>
    </dependency>
    <dependency>
      <groupId>org.apache.flink</groupId>
      <artifactId>flink-table_${scala.binary.version}</artifactId>
      <version>${flink.version}</version>
    </dependency>
    <!-- ************************ SQL ************************ -->

    <dependency>
      <groupId>mysql</groupId>
      <artifactId>mysql-connector-java</artifactId>
      <version>5.1.35</version>
    </dependency>
    <dependency>
      <groupId>com.alibaba</groupId>
      <artifactId>fastjson</artifactId>
      <version>1.2.54</version>
    </dependency>
    <dependency>
      <groupId>org.projectlombok</groupId>
      <artifactId>lombok</artifactId>
      <version>1.18.0</version>
      <!--<scope>provided</scope>-->
    </dependency>
    <dependency>
      <groupId>com.google.protobuf</groupId>
      <artifactId>protobuf-java</artifactId>
      <version>2.5.0</version>
    </dependency>
    <dependency>
      <groupId>com.alibaba.otter</groupId>
      <artifactId>canal.client</artifactId>
      <version>1.1.2</version>
    </dependency>
    <dependency>
      <groupId>com.cloudwise.toushibao</groupId>
      <artifactId>simulatedata-generator</artifactId>
      <version>0.0.1</version>
    </dependency>
    <dependency>
      <groupId>org.slf4j</groupId>
      <artifactId>slf4j-log4j12</artifactId>
      <version>1.7.7</version>
      <scope>runtime</scope>
    </dependency>
    <dependency>
      <groupId>log4j</groupId>
      <artifactId>log4j</artifactId>
      <version>1.2.17</version>
      <scope>runtime</scope>
    </dependency>
    <dependency>
      <groupId>org.apache.flink</groupId>
      <artifactId>flink-shaded-hadoop2</artifactId>
      <version>${flink.version}</version>
      <!--<scope>provided</scope>-->
    </dependency>
  </dependencies>

  <build>
    <plugins>
      <!-- Java compiler -->
      <plugin>
        <groupId>org.apache.maven.plugins</groupId>
        <artifactId>maven-compiler-plugin</artifactId>
        <version>3.1</version>
        <configuration>
          <source>${java.version}</source>
          <target>${java.version}</target>
        </configuration>
      </plugin>
      <!-- Scala compiler plugin -->
      <plugin>
        <groupId>net.alchim31.maven</groupId>
        <artifactId>scala-maven-plugin</artifactId>
        <version>3.1.6</version>
        <configuration>
          <scalaCompatVersion>2.11</scalaCompatVersion>
          <scalaVersion>2.11.12</scalaVersion>
          <encoding>UTF-8</encoding>
        </configuration>
        <executions>
          <execution>
            <id>compile-scala</id>
            <phase>compile</phase>
            <goals>
              <goal>add-source</goal>
              <goal>compile</goal>
            </goals>
          </execution>
          <execution>
            <id>test-compile-scala</id>
            <phase>test-compile</phase>
            <goals>
              <goal>add-source</goal>
              <goal>testCompile</goal>
            </goals>
          </execution>
        </executions>
      </plugin>
      <!-- We use the maven-shade plugin to create a fat JAR that contains all necessary dependencies. -->
      <!-- Change the value of <mainClass>...</mainClass> if your program entry point changes. -->
      <plugin>
        <groupId>org.apache.maven.plugins</groupId>
        <artifactId>maven-shade-plugin</artifactId>
        <version>3.0.0</version>
        <executions>
          <!-- Run the shade goal on the package phase -->
          <execution>
            <phase>package</phase>
            <goals>
              <goal>shade</goal>
            </goals>
            <configuration>
              <artifactSet>
                <excludes>
                  <exclude>org.apache.flink:force-shading</exclude>
                  <exclude>com.google.code.findbugs:jsr305</exclude>
                  <exclude>org.slf4j:*</exclude>
                  <exclude>log4j:*</exclude>
                </excludes>
              </artifactSet>
              <filters>
                <filter>
                  <!-- Do not copy the signatures in the META-INF folder.
                       Otherwise this might cause SecurityExceptions when using the JAR. -->
                  <artifact>*:*</artifact>
                  <excludes>
                    <exclude>META-INF/*.SF</exclude>
                    <exclude>META-INF/*.DSA</exclude>
                    <exclude>META-INF/*.RSA</exclude>
                  </excludes>
                </filter>
              </filters>
              <transformers>
                <transformer implementation="org.apache.maven.plugins.shade.resource.ManifestResourceTransformer">
                  <mainClass>com.dajiangtai.StreamingJob</mainClass>
                </transformer>
              </transformers>
            </configuration>
          </execution>
        </executions>
      </plugin>
    </plugins>

    <pluginManagement>
      <plugins>
        <!-- This improves the out-of-the-box experience in Eclipse by resolving some warnings. -->
        <plugin>
          <groupId>org.eclipse.m2e</groupId>
          <artifactId>lifecycle-mapping</artifactId>
          <version>1.0.0</version>
          <configuration>
            <lifecycleMappingMetadata>
              <pluginExecutions>
                <pluginExecution>
                  <pluginExecutionFilter>
                    <groupId>org.apache.maven.plugins</groupId>
                    <artifactId>maven-shade-plugin</artifactId>
                    <versionRange>[3.0.0,)</versionRange>
                    <goals>
                      <goal>shade</goal>
                    </goals>
                  </pluginExecutionFilter>
                  <action>
                    <ignore/>
                  </action>
                </pluginExecution>
                <pluginExecution>
                  <pluginExecutionFilter>
                    <groupId>org.apache.maven.plugins</groupId>
                    <artifactId>maven-compiler-plugin</artifactId>
                    <versionRange>[3.1,)</versionRange>
                    <goals>
                      <goal>testCompile</goal>
                      <goal>compile</goal>
                    </goals>
                  </pluginExecutionFilter>
                  <action>
                    <ignore/>
                  </action>
                </pluginExecution>
              </pluginExecutions>
            </lifecycleMappingMetadata>
          </configuration>
        </plugin>
      </plugins>
    </pluginManagement>
  </build>

  <!-- This profile helps to make things run out of the box in IntelliJ. -->
  <!-- It adds Flink's core classes to the runtime class path. -->
  <!-- Otherwise they are missing in IntelliJ, because the dependency is 'provided'. -->
  <profiles>
    <profile>
      <id>add-dependencies-for-IDEA</id>
      <activation>
        <property>
          <name>idea.version</name>
        </property>
      </activation>
      <dependencies>
        <dependency>
          <groupId>org.apache.flink</groupId>
          <artifactId>flink-java</artifactId>
          <version>${flink.version}</version>
          <scope>compile</scope>
        </dependency>
        <dependency>
          <groupId>org.apache.flink</groupId>
          <artifactId>flink-streaming-java_${scala.binary.version}</artifactId>
          <version>${flink.version}</version>
          <scope>compile</scope>
        </dependency>
      </dependencies>
    </profile>
  </profiles>
</project>
II. Flink DataSet API Programming Guide
Flink's biggest strength is its real-time processing. Flink treats batch processing as a special case of stream processing, so a single engine can handle both batch and streaming data, and the project will keep investing heavily in unifying batch and streaming. The Flink DataStream API programming guide introduced how to use the DataStream API; this article introduces the DataSet API, which Flink provides for batch computation. From this article you will learn:
- DataSet transformations (Transformation)
- Using sources and sinks
- Broadcast variables: the basic concept and a usage demo
- Distributed cache: the concept and a usage demo
- Demo cases for the DataSet API transformations
1. WordCount Example
Before diving into the DataSet API, let's look at a simple WordCount example to get a first feel for its programming model. The code is as follows:
public class WordCount {
    public static void main(String[] args) throws Exception {
        // Execution environment for batch processing
        ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();
        // Data source
        DataSource<String> stringDataSource = env.fromElements("hello Flink What is Apache Flink");
        // Transformation: split each line into words, emit (word, 1), group by word, sum the counts
        AggregateOperator<Tuple2<String, Integer>> wordCnt = stringDataSource
                .flatMap(new FlatMapFunction<String, Tuple2<String, Integer>>() {
                    @Override
                    public void flatMap(String value, Collector<Tuple2<String, Integer>> out) {
                        for (String word : value.split(" ")) {
                            out.collect(Tuple2.of(word, 1));
                        }
                    }
                })
                .groupBy(0)
                .sum(1);
        // Data sink: print the result to stdout
        wordCnt.print();
    }
}
As the example above shows, the basic programming model is:
- Obtain the batch execution environment, ExecutionEnvironment
- Load the data source
- Apply transformations
- Output the data
The sections below look at data sources, transformations, and data sinks in turn.
2. Data Sources
The DataSet API can read batch data sets into Flink from a variety of data sources and turn them into DataSets. There are three main kinds: file-based, collection-based, and generic data sources. You can also implement the InputFormat/RichInputFormat interfaces to connect sources with other data formats, such as CsvInputFormat and TextInputFormat. The methods exposed by the ExecutionEnvironment class show which sources are supported.
1. File-based data sources
readTextFile(path) / TextInputFormat
- Explanation: reads a text file at the given path and converts its contents into a DataSet.
- Usage:
// Read a local file
DataSet<String> localLines = env.readTextFile("file:///path/to/my/textfile");
// Read an HDFS file
DataSet<String> hdfsLines = env.readTextFile("hdfs://nnHost:nnPort/path/to/my/textfile");
readTextFileWithValue(path) / TextValueInputFormat
- Explanation: reads a text file and converts its contents into a DataSet[StringValue]. Unlike readTextFile(String), the element type is StringValue, a mutable String type; storing text as StringValue reduces the number of String objects created and eases garbage-collection pressure.
- Usage:
// Read a local file
DataSet<StringValue> localLines = env.readTextFileWithValue("file:///some/local/file");
// Read an HDFS file
DataSet<StringValue> hdfsLines = env.readTextFileWithValue("hdfs://host:port/file/path");
readCsvFile(path) / CsvInputFormat
- Explanation: creates a CSV reader for comma-separated (or otherwise delimited) files. The result can be converted directly into a DataSet of tuples or POJOs. The method lets you specify the line delimiter, field delimiter, fields to include, and so on.
- Usage:
// Read a CSV file with five fields, taking only the first and the fourth
DataSet<Tuple2<String, Double>> csvInput = env.readCsvFile("hdfs:///the/CSV/file")
    .includeFields("10010")
    .types(String.class, Double.class);
// Read a CSV file with three fields and map it to a POJO type
DataSet<Person> csvInput = env.readCsvFile("hdfs:///the/CSV/file")
    .pojoType(Person.class, "name", "age", "zipcode");
readFileOfPrimitives(path, Class) / PrimitiveInputFormat
- Explanation: reads a file of primitive data types (such as String or Integer) and returns a DataSet of the corresponding primitive type.
- Usage:
DataSet<String> data = env.readFileOfPrimitives("file:///some/local/file", String.class);
2. Collection-based data sources
fromCollection(Collection)
- Explanation: creates a DataSet from a Java collection; all elements in the collection must have the same type.
- Usage:
DataSet<String> data = env.fromCollection(arrayList);
fromElements(T ...)
- Explanation: creates a DataSet from the given sequence of elements; all elements must be of the same type.
- Usage:
DataSet<String> stringDataSource = env.fromElements("hello Flink What is Apache Flink");
generateSequence(from, to)
- Explanation: generates a DataSet of numbers in the interval from "from" to "to". Because generation happens in parallel, the order of the resulting sequence is not guaranteed.
- Usage:
DataSet<Long> longDataSource = env.generateSequence(1, 20);
3. Generic data sources
The DataSet API provides the generic InputFormat interface for connecting data sources of different kinds and formats. InputFormat implementations come in two flavors: file-based formats, which correspond to the readFile() method in the DataSet API, and generic formats, for example for reading from an RDBMS or a NoSQL database, which correspond to the createInput() method.
readFile(inputFormat, path) / FileInputFormat
- Explanation: a custom file input source that reads a file in the given format and converts it into a DataSet.
- Usage (a sketch of a hypothetical MyInputFormat follows below):
env.readFile(new MyInputFormat(), "file:///some/local/file");
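MyInputFormat above stands in for any custom format and is not defined in the original; as a minimal, hypothetical sketch, such a format could extend Flink's DelimitedInputFormat and decide how each delimited record becomes an element (here: an upper-cased String):

import org.apache.flink.api.common.io.DelimitedInputFormat;

import java.nio.charset.StandardCharsets;

// Hypothetical custom format: turns every delimited record into an upper-cased String
public class MyInputFormat extends DelimitedInputFormat<String> {
    @Override
    public String readRecord(String reuse, byte[] bytes, int offset, int numBytes) {
        // Interpret the raw bytes of one record as UTF-8 text
        return new String(bytes, offset, numBytes, StandardCharsets.UTF_8).toUpperCase();
    }
}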
createInput(inputFormat) / InputFormat
- Explanation: a generic custom data source that converts whatever it reads into a DataSet. The example below uses Flink's built-in JDBCInputFormat to read from a MySQL table and produce a DataSet<Row>.
- Usage:
DataSet<Row> dbData =
    env.createInput(
        JDBCInputFormat.buildJDBCInputFormat()
            .setDrivername("com.mysql.jdbc.Driver")
            .setDBUrl("jdbc:mysql://localhost/mydb")
            .setQuery("select name, age from stu")
            .setRowTypeInfo(new RowTypeInfo(BasicTypeInfo.STRING_TYPE_INFO, BasicTypeInfo.INT_TYPE_INFO))
            .finish()
    );
3. Data Sinks
The DataSet API supports three kinds of data output. The first is file-based, via the DataSet write() methods, which write a DataSet out to a file system. The second targets generic storage, via the DataSet output() method, for example using JDBCOutputFormat to write to a relational database. The last is client-side output, which collects the DataSet from the different nodes to the client and prints it there, for example via the DataSet print() method.
- writeAsText() / TextOutputFormat: writes elements line by line as strings, obtained by calling each element's toString() method.
- writeAsFormattedText() / TextOutputFormat: writes elements line by line as strings, obtained by calling a user-defined format() method for each element.
- writeAsCsv(...) / CsvOutputFormat: writes tuples to comma-separated files. Row and field delimiters are configurable; each field value comes from the object's toString() method.
- print() / printToErr() / print(String msg) / printToErr(String msg) (note: never use these in production; sample the data or write it to logs instead)
- write() / FileOutputFormat
- output() / OutputFormat: the generic output method, for data sinks that are not file-based (such as storing results in a database).
1. Standard output methods
// Text data
DataSet<String> textData = // [...]
// Write the data to a local file
textData.writeAsText("file:///my/result/on/localFS");
// Write the data to an HDFS file
textData.writeAsText("hdfs://nnHost:nnPort/my/result/on/localFS");
// Write to a local file, overwriting it if it already exists
textData.writeAsText("file:///my/result/on/localFS", WriteMode.OVERWRITE);

// Write the data to a local CSV file with "|" as the field delimiter
DataSet<Tuple3<String, Integer, Double>> values = // [...]
values.writeAsCsv("file:///path/to/the/result/file", "\n", "|");

// Use a custom TextFormatter
values.writeAsFormattedText("file:///path/to/the/result/file",
    new TextFormatter<Tuple3<String, Integer, Double>>() {
        public String format(Tuple3<String, Integer, Double> value) {
            return value.f1 + " - " + value.f0;
        }
    });
2. Using a custom output type
DataSet<Tuple3<String, Integer, Double>> myResult = // [...]
// Write the tuple DataSet to a relational database
myResult.output(
    // Create and configure the OutputFormat
    JDBCOutputFormat.buildJDBCOutputFormat()
        .setDrivername("com.mysql.jdbc.Driver")
        .setDBUrl("jdbc:mysql://localhost/mydb")
        .setQuery("insert into persons (name, age, height) values (?,?,?)")
        .finish()
);
3. Writing data from Flink to Elasticsearch
Dependency:
<dependency>
<groupId>org.apache.flink</groupId>
<artifactId>flink-connector-elasticsearch6_${scala.binary.version}</artifactId>
<version>${flink.version}</version>
</dependency>
The ElasticSearchSinkUtil helper class
import org.apache.flink.streaming.api.datastream.SingleOutputStreamOperator;
import org.apache.flink.streaming.connectors.elasticsearch.ElasticsearchSinkFunction;
import org.apache.flink.streaming.connectors.elasticsearch6.ElasticsearchSink;
import org.apache.http.HttpHost;

import java.net.MalformedURLException;
import java.net.URL;
import java.util.ArrayList;
import java.util.List;

public class ElasticSearchSinkUtil {
    /**
     * Adds an ES sink to the given stream.
     *
     * @param hosts               ES hosts
     * @param bulkFlushMaxActions bulk flush size
     * @param parallelism         sink parallelism
     * @param data                the stream to sink
     * @param func                turns each element into index requests
     * @param <T>                 element type
     */
    public static <T> void addSink(List<HttpHost> hosts, int bulkFlushMaxActions, int parallelism,
                                   SingleOutputStreamOperator<T> data, ElasticsearchSinkFunction<T> func) {
        ElasticsearchSink.Builder<T> esSinkBuilder = new ElasticsearchSink.Builder<>(hosts, func);
        esSinkBuilder.setBulkFlushMaxActions(bulkFlushMaxActions);
        data.addSink(esSinkBuilder.build()).setParallelism(parallelism);
    }

    /**
     * Parses the ES hosts from the configuration file.
     *
     * @param hosts comma-separated host list, e.g. "host1:9200,host2:9200"
     * @return the parsed HttpHost list
     * @throws MalformedURLException if a host entry cannot be parsed
     */
    public static List<HttpHost> getEsAddresses(String hosts) throws MalformedURLException {
        String[] hostList = hosts.split(",");
        List<HttpHost> addresses = new ArrayList<>();
        for (String host : hostList) {
            if (host.startsWith("http")) {
                URL url = new URL(host);
                addresses.add(new HttpHost(url.getHost(), url.getPort()));
            } else {
                String[] parts = host.split(":", 2);
                if (parts.length > 1) {
                    addresses.add(new HttpHost(parts[0], Integer.parseInt(parts[1])));
                } else {
                    throw new MalformedURLException("invalid elasticsearch hosts format");
                }
            }
        }
        return addresses;
    }
}
The Main class
public class Main {
public static void main(String[] args) throws Exception {
        // Parse all parameters
        final ParameterTool parameterTool = ExecutionEnvUtil.createParameterTool(args);
        // Set up the environment
        StreamExecutionEnvironment env = ExecutionEnvUtil.prepare(parameterTool);
        // Read data from Kafka
        DataStreamSource<Metrics> data = KafkaConfigUtil.buildSource(env);
        // Read the ES addresses from the configuration file
        List<HttpHost> esAddresses = ElasticSearchSinkUtil.getEsAddresses(parameterTool.get(ELASTICSEARCH_HOSTS));
        // Read the bulk flush size (the number of records per bulk request) from the configuration file.
        // Note: this is an important performance-tuning parameter.
        int bulkSize = parameterTool.getInt(ELASTICSEARCH_BULK_FLUSH_MAX_ACTIONS, 40);
        // Read the sink parallelism from the configuration file. This is also a performance-tuning
        // parameter: a higher value lets the sink consume faster and keeps data from piling up in Kafka.
        int sinkParallelism = parameterTool.getInt(STREAM_SINK_PARALLELISM, 5);
        // A thin wrapper around the built-in ES sink
        ElasticSearchSinkUtil.addSink(esAddresses, bulkSize, sinkParallelism, data,
            (Metrics metric, RuntimeContext runtimeContext, RequestIndexer requestIndexer) -> {
                requestIndexer.add(Requests.indexRequest()
                        .index(ZHISHENG + "_" + metric.getName()) // ES index name
                        .type(ZHISHENG)                           // ES type
                        .source(GsonUtil.toJSONBytes(metric), XContentType.JSON));
            });
env.execute("flink learning connectors es6");
/**
DataStream<String> input = ...;
input.addSink(new ElasticsearchSink<>(
config, transportAddresses,
new ElasticsearchSinkFunction<String>() {...},
new ActionRequestFailureHandler() {
@Override
void onFailure(ActionRequest action,
Throwable failure,
int restStatusCode,
RequestIndexer indexer) throws Throwable {
if (ExceptionUtils.containsThrowable(failure, EsRejectedExecutionException.class)) {
// full queue; re-add document for indexing
indexer.add(action);
} else if (ExceptionUtils.containsThrowable(failure, ElasticsearchParseException.class)) {
// malformed document; simply drop request without failing sink
} else {
// for all other failures, fail the sink
// here the failure is simply rethrown, but users can also choose to throw custom exceptions
throw failure;
}
}
}));
**/
}
}
Configuration file
kafka.brokers=localhost:9092
kafka.group.id=zhisheng-metrics-group-test
kafka.zookeeper.connect=localhost:2181
metrics.topic=zhisheng-metrics
stream.parallelism=5
stream.checkpoint.interval=1000
stream.checkpoint.enable=false
elasticsearch.hosts=localhost:9200
elasticsearch.bulk.flush.max.actions=40
stream.sink.parallelism=5
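For reference, these keys are loaded through Flink's ParameterTool (the constants such as ELASTICSEARCH_HOSTS in the Main class are just their names). A minimal sketch of reading the same file directly; the file path here is an assumption, not part of the original project:

import org.apache.flink.api.java.utils.ParameterTool;

public class ConfigDemo {
    public static void main(String[] args) throws Exception {
        // Path is an assumption; point it at the properties file shown above
        ParameterTool parameterTool = ParameterTool.fromPropertiesFile("src/main/resources/application.properties");
        // Keys match the configuration file above
        String esHosts = parameterTool.get("elasticsearch.hosts");
        int bulkSize = parameterTool.getInt("elasticsearch.bulk.flush.max.actions", 40);
        int sinkParallelism = parameterTool.getInt("stream.sink.parallelism", 5);
        System.out.println(esHosts + ", " + bulkSize + ", " + sinkParallelism);
    }
}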
Tuning the configuration
1. bulk.flush.backoff.enable: whether to enable the retry (backoff) mechanism
2. bulk.flush.backoff.type: the retry strategy; either EXPONENTIAL (the interval between retries grows exponentially) or CONSTANT (the interval between retries is a fixed constant)
3. bulk.flush.backoff.delay: the delay between retries
4. bulk.flush.backoff.retries: the number of retry attempts
5. bulk.flush.max.actions: the maximum number of records per bulk write
6. bulk.flush.max.size.mb: the maximum amount of data (in MB) per bulk write
7. bulk.flush.interval.ms: the flush interval for bulk writes; when set, it is enforced strictly, regardless of the two batch-size settings above (see the sketch below for how these map onto the sink builder)
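On the flink-connector-elasticsearch6 sink used above, these options correspond to setters on ElasticsearchSink.Builder. A minimal sketch of wiring them up, assuming the esAddresses list and an ElasticsearchSinkFunction named sinkFunction from the Main class above; the threshold values are illustrative, not recommendations:

import org.apache.flink.streaming.connectors.elasticsearch.ElasticsearchSinkBase;
import org.apache.flink.streaming.connectors.elasticsearch6.ElasticsearchSink;

ElasticsearchSink.Builder<Metrics> esSinkBuilder = new ElasticsearchSink.Builder<>(esAddresses, sinkFunction);
// Flush a bulk request after 40 records, 10 MB, or 5 s, whichever comes first
esSinkBuilder.setBulkFlushMaxActions(40);
esSinkBuilder.setBulkFlushMaxSizeMb(10);
esSinkBuilder.setBulkFlushInterval(5000);
// Retry rejected bulk requests up to 3 times, starting at 100 ms with exponential backoff
esSinkBuilder.setBulkFlushBackoff(true);
esSinkBuilder.setBulkFlushBackoffType(ElasticsearchSinkBase.FlushBackoffType.EXPONENTIAL);
esSinkBuilder.setBulkFlushBackoffRetries(3);
esSinkBuilder.setBulkFlushBackoffDelay(100);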
4. Writing data from Flink to Kafka
Dependency:
<dependency>
<groupId>org.apache.flink</groupId>
<artifactId>flink-connector-kafka-0.11_2.11</artifactId>
<version>${flink.version}</version>
</dependency>
Configuration file
kafka.brokers=xxx:9092,xxx:9092,xxx:9092
kafka.group.id=metrics-group-test
kafka.zookeeper.connect=xxx:2181
metrics.topic=xxx
stream.parallelism=5
kafka.sink.brokers=localhost:9092
kafka.sink.topic=metric-test
stream.checkpoint.interval=1000
stream.checkpoint.enable=false
stream.sink.parallelism=5
Code:
public class Main {
public static void main(String[] args) throws Exception{
final ParameterTool parameterTool = ExecutionEnvUtil.createParameterTool(args);
StreamExecutionEnvironment env = ExecutionEnvUtil.prepare(parameterTool);
DataStreamSource<Metrics> data = KafkaConfigUtil.buildSource(env);
data.addSink(new FlinkKafkaProducer011<Metrics>(
parameterTool.get("kafka.sink.brokers"),
parameterTool.get("kafka.sink.topic"),
new MetricSchema()
)).name("flink-connectors-kafka")
.setParallelism(parameterTool.getInt("stream.sink.parallelism"));
env.execute("flink learning connectors kafka");
}
}
You can inspect the topic with the following command:
bin/kafka-topics.sh --describe --zookeeper localhost:2181 --topic metric-test
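To confirm that records are actually arriving, you can also tail the topic with the console consumer (assuming a local broker on the default port):
bin/kafka-console-consumer.sh --bootstrap-server localhost:9092 --topic metric-test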
5. Reading Kafka data with Flink and batch-writing it to MySQL
Dependency:
<dependency>
<groupId>mysql</groupId>
<artifactId>mysql-connector-java</artifactId>
<version>5.1.34</version>
</dependency>
Reading Kafka data
package com.zhisheng.connectors.mysql.utils;
import com.zhisheng.common.utils.GsonUtil;
import com.zhisheng.connectors.mysql.model.Student;
import org.apache.kafka.clients.producer.KafkaProducer;
import org.apache.kafka.clients.producer.ProducerRecord;
import java.util.Properties;
/**
* Desc: writes test data into Kafka; run this main method to generate input for the Flink job
* Created by zhisheng on 2019-02-17
* Blog: http://www.54tianzhisheng.cn/tags/Flink/
*/
public class KafkaUtil {
public static final String broker_list = "localhost:9092";
public static final String topic = "student"; // the Kafka topic; must match the topic the Flink job consumes
public static void writeToKafka() throws InterruptedException {
Properties props = new Properties();
props.put("bootstrap.servers", broker_list);
props.put("key.serializer", "org.apache.kafka.common.serialization.StringSerializer");
props.put("value.serializer", "org.apache.kafka.common.serialization.StringSerializer");
KafkaProducer<String, String> producer = new KafkaProducer<>(props);
for (int i = 1; i <= 100; i++) {
Student student = new Student(i, "zhisheng" + i, "password" + i, 18 + i);
ProducerRecord<String, String> record = new ProducerRecord<>(topic, null, null, GsonUtil.toJson(student));
producer.send(record);
System.out.println("发送数据: " + GsonUtil.toJson(student));
Thread.sleep(10 * 1000); //发送一条数据 sleep 10s,相当于 1 分钟 6 条
}
producer.flush();
}
public static void main(String[] args) throws InterruptedException {
writeToKafka();
}
}
Writing to the database
Dependency:
<dependency>
<groupId>org.apache.commons</groupId>
<artifactId>commons-dbcp2</artifactId>
<version>2.1.1</version>
</dependency>
The MySQL sink class
package com.zhisheng.connectors.mysql.sinks;
import com.zhisheng.connectors.mysql.model.Student;
import org.apache.commons.dbcp2.BasicDataSource;
import org.apache.flink.configuration.Configuration;
import org.apache.flink.streaming.api.functions.sink.RichSinkFunction;
import javax.sql.DataSource;
import java.sql.Connection;
import java.sql.DriverManager;
import java.sql.PreparedStatement;
import java.util.List;
/**
* Desc: batch-sinks data to MySQL
* Created by zhisheng_tian on 2019-02-17
* Blog: http://www.54tianzhisheng.cn/tags/Flink/
*/
public class SinkToMySQL extends RichSinkFunction<List<Student>> {
PreparedStatement ps;
BasicDataSource dataSource;
private Connection connection;
/**
* Open the connection in open() so that a connection does not have to be created and released on every invoke() call.
*
* @param parameters
* @throws Exception
*/