DStream-02: Checkpoint Demystified
How to use DStream Checkpoint
import org.apache.kafka.common.serialization.StringDeserializer
import org.apache.spark.SparkConf
import org.apache.spark.streaming.{Durations, StreamingContext}
import org.apache.spark.streaming.kafka010.ConsumerStrategies.Subscribe
import org.apache.spark.streaming.kafka010.KafkaUtils
import org.apache.spark.streaming.kafka010.LocationStrategies.PreferConsistent

object DstreamCheckpoint {

  def main(args: Array[String]): Unit = {
    // Rebuild the context from checkpoint_dir if a checkpoint exists;
    // otherwise create a fresh one via functionToCreateContext.
    val ssc = StreamingContext.getOrCreate("checkpoint_dir", functionToCreateContext)
    ssc.sparkContext.setLogLevel("ERROR")
    ssc.start()
    ssc.awaitTermination()
  }

  def functionToCreateContext(): StreamingContext = {
    // Only invoked on a cold start, i.e. when no checkpoint can be read.
    println("functionToCreateContext invoke")
    val sparkConf = new SparkConf()
      .setMaster("local[*]")
      .setAppName("DstreamCheckpoint")
    val ssc = new StreamingContext(sparkConf, Durations.seconds(2))
    val kafkaParams = Map[String, Object](
      "bootstrap.servers" -> "s1:9092",
      "key.deserializer" -> classOf[StringDeserializer],
      "value.deserializer" -> classOf[StringDeserializer],
      "group.id" -> "group_test",
      "auto.offset.reset" -> "earliest",
      "enable.auto.commit" -> (false: java.lang.Boolean)
    )
    val topics = Array("test_mxb")
    val dstream = KafkaUtils.createDirectStream(ssc, PreferConsistent, Subscribe[String, String](topics, kafkaParams))
    dstream.map(record => (record.key, record.value, record.partition(), record.offset()))
      .foreachRDD(rdd => {
        // ... per-batch processing, elided in the original
      })
    ssc.checkpoint("checkpoint_dir")
    ssc
  }
}
The code above can recover from failures and, on restart, resume from the offsets recorded before the crash. However, once the application code is modified, the old checkpoint can no longer be used for recovery, because it contains the serialized DStreamGraph of the old code.
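The foreachRDD body is elided in the listing above. Purely as an illustration (this processing logic is an assumption, not part of the original), a batch handler could read the Kafka offset ranges each batch covers via HasOffsetRanges, which are exactly the offsets the checkpoint lets the job resume from:

import org.apache.spark.streaming.kafka010.HasOffsetRanges

dstream.foreachRDD { rdd =>
  // Inspect which Kafka offsets this batch covers (illustrative only)
  val ranges = rdd.asInstanceOf[HasOffsetRanges].offsetRanges
  ranges.foreach { r =>
    println(s"${r.topic} partition ${r.partition}: ${r.fromOffset} -> ${r.untilOffset}")
  }
}

Note that the HasOffsetRanges cast only succeeds on RDDs of the direct stream itself, i.e. before any transformation such as map.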
StreamingContext.getOrCreate("checkpoint_dir", functionToCreateContext) is a method on the StreamingContext companion object.
Spark source:
- It reads a Checkpoint object from checkpoint_dir: if one is found, a StreamingContext is constructed from it; if not, the creatingFunc we passed in is invoked to create a fresh StreamingContext. Constructing a StreamingContext from a Checkpoint object triggers the initializers that restore the SparkContext and the DStreamGraph from that Checkpoint.
def getOrCreate(
    checkpointPath: String,
    creatingFunc: () => StreamingContext,
    hadoopConf: Configuration = SparkHadoopUtil.get.conf,
    createOnError: Boolean = false
  ): StreamingContext = {
  val checkpointOption = CheckpointReader.read(
    checkpointPath, new SparkConf(), hadoopConf, createOnError)
  checkpointOption.map(new StreamingContext(null, _, null)).getOrElse(creatingFunc())
}
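The new StreamingContext(null, _, null) call goes through the three-argument primary constructor, whose parameters map onto the _sc, _cp and _batchDur fields used in the initializers below (Spark 2.x source, abridged):

class StreamingContext private[streaming] (
    _sc: SparkContext,
    _cp: Checkpoint,
    _batchDur: Duration
  ) extends Logging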
- Restoring the SparkContext and the DStreamGraph from the Checkpoint object:
private[streaming] val sc: SparkContext = {
  if (_sc != null) {
    _sc
  } else if (isCheckpointPresent) {
    // Rebuild the SparkConf from the checkpointed properties, then
    // reuse an existing SparkContext or create a new one from it.
    SparkContext.getOrCreate(_cp.createSparkConf())
  } else {
    throw new SparkException("Cannot create StreamingContext without a SparkContext")
  }
}
private[streaming] val graph: DStreamGraph = {
  if (isCheckpointPresent) {
    // Reattach the deserialized graph to this context and restore the
    // checkpointed data of every DStream in it.
    _cp.graph.setContext(this)
    _cp.graph.restoreCheckpointData()
    _cp.graph
  } else {
    require(_batchDur != null, "Batch duration for StreamingContext cannot be null")
    val newGraph = new DStreamGraph()
    newGraph.setBatchDuration(_batchDur)
    newGraph
  }
}
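For completeness, restoreCheckpointData on the DStreamGraph simply fans out to every output DStream, each of which restores its own checkpointed RDDs (Spark 2.x source, abridged):

def restoreCheckpointData() {
  logInfo("Restoring checkpoint data")
  this.synchronized {
    outputStreams.foreach(_.restoreCheckpointData())
  }
  logInfo("Restored checkpoint data")
}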