Flink - ResultPartition
Data is generally emitted through Collector.collect:
public interface Collector<T> {

    /**
     * Emits a record.
     *
     * @param record The record to collect.
     */
    void collect(T record);

    /**
     * Closes the collector. If any data was buffered, that data will be flushed.
     */
    void close();
}
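To recall where these calls come from: a user function emits records through the Collector handed to it by the runtime, and at the end of an operator chain those calls reach the RecordWriterOutput shown below. A minimal sketch using the standard FlatMapFunction API (the tokenizer logic is made up purely for illustration):

import org.apache.flink.api.common.functions.FlatMapFunction;
import org.apache.flink.util.Collector;

// A user-defined function: every out.collect() call below eventually reaches
// RecordWriterOutput.collect() and from there RecordWriter.emit().
public class Tokenizer implements FlatMapFunction<String, String> {

    @Override
    public void flatMap(String line, Collector<String> out) throws Exception {
        for (String token : line.split("\\s+")) {
            out.collect(token); // emit one record downstream
        }
    }
}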
Output extends Collector:
public interface Output<T> extends Collector<T> {

    /**
     * Emits a {@link Watermark} from an operator. This watermark is broadcast to all downstream
     * operators.
     *
     * <p>A watermark specifies that no element with a timestamp lower or equal to the watermark
     * timestamp will be emitted in the future.
     */
    void emitWatermark(Watermark mark);
}
RecordWriterOutput
public class RecordWriterOutput<OUT> implements Output<StreamRecord<OUT>> {

    private StreamRecordWriter<SerializationDelegate<StreamElement>> recordWriter;

    private SerializationDelegate<StreamElement> serializationDelegate;

    @Override
    public void collect(StreamRecord<OUT> record) {
        serializationDelegate.setInstance(record);

        try {
            recordWriter.emit(serializationDelegate);
        }
        catch (Exception e) {
            throw new RuntimeException(e.getMessage(), e);
        }
    }
}
RecordWriter
public class RecordWriter<T extends IOReadableWritable> {

    protected final ResultPartitionWriter writer;      // the writer that writes into the ResultPartition

    private final ChannelSelector<T> channelSelector;  // picks the target channel(s); defaults to RoundRobinChannelSelector

    private final int numChannels;

    /** {@link RecordSerializer} per outgoing channel */
    private final RecordSerializer<T>[] serializers;

    public RecordWriter(ResultPartitionWriter writer) {
        this(writer, new RoundRobinChannelSelector<T>());
    }

    @SuppressWarnings("unchecked")
    public RecordWriter(ResultPartitionWriter writer, ChannelSelector<T> channelSelector) {
        this.writer = writer;
        this.channelSelector = channelSelector;
        this.numChannels = writer.getNumberOfOutputChannels(); // number of channels

        /**
         * The runtime exposes a channel abstraction for the produced results
         * (see {@link ChannelSelector}). Every channel has an independent
         * serializer.
         */
        this.serializers = new SpanningRecordSerializer[numChannels];
        for (int i = 0; i < numChannels; i++) {
            serializers[i] = new SpanningRecordSerializer<T>(); // one serializer per channel
        }
    }

    public void emit(T record) throws IOException, InterruptedException {
        for (int targetChannel : channelSelector.selectChannels(record, numChannels)) { // for each selected channel
            // serialize with corresponding serializer and send full buffer
            RecordSerializer<T> serializer = serializers[targetChannel];

            synchronized (serializer) { // a channel's serializer must not be written concurrently
                SerializationResult result = serializer.addRecord(record);

                while (result.isFullBuffer()) {                                      // the buffer (MemorySegment) is full
                    Buffer buffer = serializer.getCurrentBuffer();                   // take the full buffer out
                    if (buffer != null) {
                        writeBuffer(buffer, targetChannel, serializer);              // write the buffer out
                    }

                    buffer = writer.getBufferProvider().requestBufferBlocking();     // request a new buffer
                    result = serializer.setNextBuffer(buffer);                       // hand the new buffer to the serializer
                }
            }
        }
    }
}
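The default RoundRobinChannelSelector mentioned in the comments simply cycles through the channels. A minimal sketch that follows the selectChannels(record, numChannels) contract used by emit above (a standalone illustration, not the actual Flink class):

// Illustration only: a round-robin selector matching the
// selectChannels(record, numChannels) contract used by RecordWriter.emit().
// Each record is routed to exactly one channel, cycling 0, 1, ..., numChannels - 1.
// Not thread-safe; a RecordWriter drives it from a single thread.
public class SimpleRoundRobinSelector<T> {

    private int nextChannel = -1;

    public int[] selectChannels(T record, int numChannels) {
        nextChannel = (nextChannel + 1) % numChannels;
        return new int[] { nextChannel };
    }
}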
writeBuffer
private void writeBuffer(
        Buffer buffer,
        int targetChannel,
        RecordSerializer<T> serializer) throws IOException {

    try {
        writer.writeBuffer(buffer, targetChannel);
    }
    finally {
        serializer.clearCurrentBuffer();
    }
}
As you can see, both writing buffers and requesting buffers go through the ResultPartitionWriter:
public final class ResultPartitionWriter implements EventListener<TaskEvent> {

    private final ResultPartition partition; // the ResultPartition

    private final TaskEventHandler taskEventHandler = new TaskEventHandler();

    public ResultPartitionWriter(ResultPartition partition) {
        this.partition = partition;
    }

    // ------------------------------------------------------------------------
    // Attributes
    // ------------------------------------------------------------------------

    public ResultPartitionID getPartitionId() {
        return partition.getPartitionId();
    }

    public BufferProvider getBufferProvider() {
        return partition.getBufferProvider();
    }

    public int getNumberOfOutputChannels() {
        return partition.getNumberOfSubpartitions();
    }

    // ------------------------------------------------------------------------
    // Data processing
    // ------------------------------------------------------------------------

    public void writeBuffer(Buffer buffer, int targetChannel) throws IOException {
        partition.add(buffer, targetChannel);
    }
}
ResultPartitionWriter in turn delegates everything to the ResultPartition; writeBuffer just adds the buffer to the partition.
ResultPartition
The initialization process:
The task creates its ResultPartitions during construction:
// Produced intermediate result partitions
this.producedPartitions = new ResultPartition[partitions.size()];
this.writers = new ResultPartitionWriter[partitions.size()];

for (int i = 0; i < this.producedPartitions.length; i++) {
    ResultPartitionDeploymentDescriptor desc = partitions.get(i);
    ResultPartitionID partitionId = new ResultPartitionID(desc.getPartitionId(), executionId);

    this.producedPartitions[i] = new ResultPartition(
            taskNameWithSubtaskAndId,
            jobId,
            partitionId,
            desc.getPartitionType(),
            desc.getEagerlyDeployConsumers(),
            desc.getNumberOfSubpartitions(),
            networkEnvironment.getPartitionManager(),
            networkEnvironment.getPartitionConsumableNotifier(),
            ioManager,
            networkEnvironment.getDefaultIOMode());

    this.writers[i] = new ResultPartitionWriter(this.producedPartitions[i]);
}
In Task.run, the task first registers with the NetworkEnvironment:
network.registerTask(this);
The main work here is to create a LocalBufferPool whose required size equals the number of subpartitions, and to register it with the ResultPartition:
// Create the LocalBufferPool. Note that the number of required segments equals
// the number of subpartitions, i.e. one segment per subpartition.
bufferPool = networkBufferPool.createBufferPool(partition.getNumberOfSubpartitions(), false);

// Register the local pool with the ResultPartition
partition.registerBufferPool(bufferPool);
So,
writer.getBufferProvider().requestBufferBlocking();
ends up calling LocalBufferPool.requestBuffer.
If there are availableMemorySegments, one is used directly; if there are none:
if (numberOfRequestedMemorySegments < currentPoolSize) {
    final MemorySegment segment = networkBufferPool.requestMemorySegment(); // if the pool may still grow, request a segment from the NetworkBufferPool

    if (segment != null) {
        numberOfRequestedMemorySegments++;
        availableMemorySegments.add(segment);

        continue;
    }
}

if (askToRecycle) { // if no new segment can be requested, ask the owner to try to release memory
    owner.releaseMemory(1);
}

if (isBlocking) { // as a last resort, block and wait for up to 2 seconds
    availableMemorySegments.wait(2000);
}
public void releaseMemory(int toRelease) throws IOException {
    for (ResultSubpartition subpartition : subpartitions) {
        toRelease -= subpartition.releaseMemory(); // ask each subpartition to release memory

        // Only release as much memory as needed
        if (toRelease <= 0) {
            break;
        }
    }
}
So during emit, if no segment is available, the writer blocks and waits. For a PipelinedSubpartition, releaseMemory does nothing, so if buffers are not sent out and recycled in time, the writer will keep blocking and waiting:
public int releaseMemory() {
    // The pipelined subpartition does not react to memory release requests. The buffers will be
    // recycled by the consuming task.
    return 0;
}
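This back-pressure effect can be reproduced with a much simpler model: a bounded pool of buffers where the producer blocks in request() until the consumer calls recycle(). A minimal sketch, not Flink code, with made-up names, purely to illustrate why a slow consumer stalls the writer:

import java.util.ArrayDeque;
import java.util.Queue;

// Toy model of the blocking behaviour: a fixed number of buffers circulates between
// a producer (request) and a consumer (recycle). If the consumer stops recycling,
// the producer blocks, which is exactly the pipelined back-pressure effect above.
public class ToyBufferPool {

    private final Queue<byte[]> available = new ArrayDeque<>();

    public ToyBufferPool(int numBuffers, int bufferSize) {
        for (int i = 0; i < numBuffers; i++) {
            available.add(new byte[bufferSize]);
        }
    }

    /** Blocks until a buffer has been recycled, like requestBufferBlocking(). */
    public synchronized byte[] request() throws InterruptedException {
        while (available.isEmpty()) {
            wait(); // nothing to hand out: the producer stalls here
        }
        return available.poll();
    }

    /** Returns a buffer to the pool, like Buffer.recycle() on the consumer side. */
    public synchronized void recycle(byte[] buffer) {
        available.add(buffer);
        notifyAll(); // wake up a blocked producer
    }
}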
ResultPartition.add
public void add(Buffer buffer, int subpartitionIndex) throws IOException {
    boolean success = false;

    try {
        checkInProduceState();

        final ResultSubpartition subpartition = subpartitions[subpartitionIndex]; // pick the ResultSubpartition for this index

        synchronized (subpartition) {
            success = subpartition.add(buffer); // add the buffer to the ResultSubpartition

            // Update statistics
            totalNumberOfBuffers++;
            totalNumberOfBytes += buffer.getSize();
        }
    }
    finally {
        if (success) {
            notifyPipelinedConsumers(); // notify the ResultPartitionConsumableNotifier to trigger notifyPartitionConsumable
        }
        else {
            buffer.recycle(); // on failure, recycle the buffer
        }
    }
}
For PipelinedSubpartition, add simply appends the buffer to its queue:
/**
 * A pipelined in-memory only subpartition, which can be consumed once.
 */
class PipelinedSubpartition extends ResultSubpartition {

    /**
     * A data availability listener. Registered, when the consuming task is faster than the
     * producing task.
     */
    private NotificationListener registeredListener; // notified when new data arrives, so the consumer can react

    /** The read view to consume this subpartition. */
    private PipelinedSubpartitionView readView;

    /** All buffers of this subpartition. Access to the buffers is synchronized on this object. */
    final ArrayDeque<Buffer> buffers = new ArrayDeque<Buffer>(); // the buffer queue

    PipelinedSubpartition(int index, ResultPartition parent) {
        super(index, parent);
    }

    @Override
    public boolean add(Buffer buffer) {
        checkNotNull(buffer);

        final NotificationListener listener;

        synchronized (buffers) {
            if (isReleased || isFinished) {
                return false;
            }

            // Add the buffer and update the stats
            buffers.add(buffer); // append to the buffer queue
            updateStatistics(buffer);

            // Get the listener...
            listener = registeredListener;
            registeredListener = null;
        }

        // Notify the listener outside of the synchronized block
        if (listener != null) {
            listener.onNotification(); // tell the waiting consumer that data is available
        }

        return true;
    }
}
NettyConnectionManager
@Override
public void start(ResultPartitionProvider partitionProvider,
                  TaskEventDispatcher taskEventDispatcher,
                  NetworkBufferPool networkbufferPool) throws IOException {

    PartitionRequestProtocol partitionRequestProtocol =
            new PartitionRequestProtocol(partitionProvider, taskEventDispatcher, networkbufferPool);

    client.init(partitionRequestProtocol, bufferPool);
    server.init(partitionRequestProtocol, bufferPool);
}
PartitionRequestProtocol
// +-------------------------------------------------------------------+
// |                        SERVER CHANNEL PIPELINE                     |
// |                                                                     |
// |    +----------+----------+ (3) write  +----------------------+     |
// |    | Queue of queues     +----------->| Message encoder      |     |
// |    +----------+----------+            +-----------+----------+     |
// |              /|\                                 \|/               |
// |               | (2) enqueue                       |                |
// |    +----------+----------+                        |                |
// |    | Request handler     |                        |                |
// |    +----------+----------+                        |                |
// |              /|\                                  |                |
// |               |                                   |                |
// |    +----------+----------+                        |                |
// |    | Message decoder     |                        |                |
// |    +----------+----------+                        |                |
// |              /|\                                  |                |
// |               |                                   |                |
// |    +----------+----------+                        |                |
// |    | Frame decoder       |                        |                |
// |    +----------+----------+                        |                |
// |              /|\                                  |                |
// +---------------+-----------------------------------+---------------+
// |               | (1) client request               \|/              |
// +---------------+-----------------------------------+---------------+
// |               |                                   |                |
// |       [ Socket.read() ]                    [ Socket.write() ]      |
// |                                                                    |
// |  Netty Internal I/O Threads (Transport Implementation)             |
// +-------------------------------------------------------------------+

@Override
public ChannelHandler[] getServerChannelHandlers() {
    PartitionRequestQueue queueOfPartitionQueues = new PartitionRequestQueue();
    PartitionRequestServerHandler serverHandler = new PartitionRequestServerHandler(
            partitionProvider, taskEventDispatcher, queueOfPartitionQueues, networkbufferPool);

    return new ChannelHandler[] {
            messageEncoder,
            createFrameLengthDecoder(),
            messageDecoder,
            serverHandler,
            queueOfPartitionQueues
    };
}
PartitionRequestServerHandler
The server handler allocates a buffer pool with a required size of only 1; since the second argument is false (not fixed-size), the pool can still be given additional segments when the NetworkBufferPool has spare ones:
public void channelRegistered(ChannelHandlerContext ctx) throws Exception {
    super.channelRegistered(ctx);

    bufferPool = networkBufferPool.createBufferPool(1, false);
}
protected void channelRead0(ChannelHandlerContext ctx, NettyMessage msg) throws Exception
PartitionRequest request = (PartitionRequest) msg;

LOG.debug("Read channel on {}: {}.", ctx.channel().localAddress(), request);

try {
    ResultSubpartitionView subpartition =
            partitionProvider.createSubpartitionView(
                    request.partitionId, request.queueIndex, bufferPool);

    outboundQueue.enqueue(subpartition, request.receiverId); // enqueue into the PartitionRequestQueue for sending
}
ResultPartitionManager implements the ResultPartitionProvider interface, so this calls ResultPartitionManager.createSubpartitionView:
synchronized (registeredPartitions) {
    final ResultPartition partition =
            registeredPartitions.get(partitionId.getProducerId(), partitionId.getPartitionId());

    return partition.createSubpartitionView(subpartitionIndex, bufferProvider);
}
ResultPartition
public ResultSubpartitionView createSubpartitionView(int index, BufferProvider bufferProvider) throws IOException {
    int refCnt = pendingReferences.get();

    checkState(refCnt != -1, "Partition released.");
    checkState(refCnt > 0, "Partition not pinned.");

    ResultSubpartitionView readView = subpartitions[index].createReadView(bufferProvider);

    return readView;
}
The definition of pendingReferences:
/**
 * The total number of references to subpartitions of this result. The result partition can be
 * safely released, iff the reference count is zero. A reference count of -1 denotes that the
 * result partition has been released.
 */
private final AtomicInteger pendingReferences = new AtomicInteger();
PipelinedSubpartitionView
class PipelinedSubpartitionView implements ResultSubpartitionView {

    /** The subpartition this view belongs to. */
    private final PipelinedSubpartition parent;

    /** Flag indicating whether this view has been released. */
    private AtomicBoolean isReleased = new AtomicBoolean();

    PipelinedSubpartitionView(PipelinedSubpartition parent) {
        this.parent = checkNotNull(parent);
    }

    @Override
    public Buffer getNextBuffer() {
        synchronized (parent.buffers) {
            return parent.buffers.poll(); // poll directly from the buffer queue of the parent PipelinedSubpartition
        }
    }

    @Override
    public boolean registerListener(NotificationListener listener) {
        return !isReleased.get() && parent.registerListener(listener);
    }

    @Override
    public void notifySubpartitionConsumed() { // once fully consumed, release this subpartition
        releaseAllResources();
    }

    @Override
    public void releaseAllResources() {
        if (isReleased.compareAndSet(false, true)) {
            // The view doesn't hold any resources and the parent cannot be restarted. Therefore,
            // it's OK to notify about consumption as well.
            parent.onConsumedSubpartition();
        }
    }
}
When PartitionRequestQueue sends data, it calls getNextBuffer to fetch the next buffer. When sending is done, i.e. when an EndOfPartitionEvent is read, it calls notifySubpartitionConsumed. Release then goes through PipelinedSubpartition.onConsumedSubpartition –> ResultPartition.onConsumedSubpartition:
void onConsumedSubpartition(int subpartitionIndex) {
    if (isReleased.get()) { // already released
        return;
    }

    int refCnt = pendingReferences.decrementAndGet(); // one subpartition fully consumed, decrement by 1

    if (refCnt == 0) { // all subpartitions have been consumed
        partitionManager.onConsumedPartition(this); // tell the partition manager to release this partition
    }
}
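The pendingReferences pattern, reference counting with -1 as a "released" sentinel, can be summarized in isolation. A minimal sketch of the same idea (illustrative only; the class and the pin method are made up, not taken from Flink):

import java.util.concurrent.atomic.AtomicInteger;

// Illustration of the pendingReferences idea: each consumer pins the resource once,
// each completed consumption unpins it; when the count reaches zero the resource is
// released, and -1 marks it as released for good.
public class RefCountedResource {

    private final AtomicInteger pendingReferences = new AtomicInteger();

    /** Called once per expected consumer (hypothetical helper, not Flink's API). */
    public void pin() {
        while (true) {
            int refCnt = pendingReferences.get();
            if (refCnt == -1) {
                throw new IllegalStateException("Already released.");
            }
            if (pendingReferences.compareAndSet(refCnt, refCnt + 1)) {
                return;
            }
        }
    }

    /** Called when one consumer is done; releases the resource after the last one. */
    public void onConsumed() {
        if (pendingReferences.decrementAndGet() == 0) {
            release();
        }
    }

    private void release() {
        pendingReferences.set(-1);
        // free the underlying resources here
    }
}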
When are the buffers in a PipelinedSubpartition released and returned to the LocalBufferPool? Let's look at the sending path of PartitionRequestQueue in detail.
writeAndFlushNextMessageIfPossible
buffer = currentPartitionQueue.getNextBuffer();

BufferResponse resp = new BufferResponse(
        buffer,
        currentPartitionQueue.getSequenceNumber(),
        currentPartitionQueue.getReceiverId()); // wrap the buffer into a BufferResponse

if (!buffer.isBuffer() &&
        EventSerializer.fromBuffer(buffer, getClass().getClassLoader()).getClass() == EndOfPartitionEvent.class) { // end-of-partition event reached

    currentPartitionQueue.notifySubpartitionConsumed(); // notify consumption
    currentPartitionQueue.releaseAllResources();        // release all resources
    markAsReleased(currentPartitionQueue.getReceiverId());

    currentPartitionQueue = null;
}

// The actual send. WriteAndFlushNextMessageIfPossibleListener calls
// writeAndFlushNextMessageIfPossible again, so buffers keep being read and sent.
channel.writeAndFlush(resp).addListener(writeListener);
In BufferResponse.write, the header (16-byte receiver ID, 4-byte sequence number, 1-byte isBuffer flag, 4-byte length) and the buffer contents are written into a Netty ByteBuf, and the original buffer is recycled in the finally block:
@Override
ByteBuf write(ByteBufAllocator allocator) throws IOException {
    int length = 16 + 4 + 1 + 4 + buffer.getSize();

    ByteBuf result = null;

    try {
        result = allocateBuffer(allocator, ID, length);

        receiverId.writeTo(result);
        result.writeInt(sequenceNumber);
        result.writeBoolean(buffer.isBuffer());
        result.writeInt(buffer.getSize());
        result.writeBytes(buffer.getNioBuffer());

        return result;
    }
    catch (Throwable t) {
        if (result != null) {
            result.release();
        }

        throw new IOException(t);
    }
    finally {
        if (buffer != null) {
            buffer.recycle(); // recycle the buffer once its contents have been copied into the Netty buffer
        }
    }
}
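The same wire layout can be made explicit without Netty using a plain java.nio.ByteBuffer. A sketch under the assumption that the receiver ID is two longs, as its 16-byte share of the length computation suggests (illustration only, not the Flink encoder):

import java.nio.ByteBuffer;

// Illustration of the BufferResponse payload layout seen above:
// 16 bytes receiver id + 4 bytes sequence number + 1 byte isBuffer flag
// + 4 bytes payload length + the payload itself.
public final class BufferResponseLayout {

    public static ByteBuffer encode(long receiverIdUpper, long receiverIdLower,
                                    int sequenceNumber, boolean isBuffer, byte[] payload) {
        ByteBuffer out = ByteBuffer.allocate(16 + 4 + 1 + 4 + payload.length);

        out.putLong(receiverIdUpper);              // 16-byte receiver id (two longs)
        out.putLong(receiverIdLower);
        out.putInt(sequenceNumber);                // 4-byte sequence number
        out.put((byte) (isBuffer ? 1 : 0));        // 1-byte flag: data buffer vs. event
        out.putInt(payload.length);                // 4-byte payload length
        out.put(payload);                          // the buffer contents

        out.flip();
        return out;
    }
}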
Buffer
public void recycle() {
    synchronized (recycleLock) {
        if (--referenceCount == 0) {
            recycler.recycle(memorySegment);
        }
    }
}
LocalBufferPool
@Override
public void recycle(MemorySegment segment) {
    synchronized (availableMemorySegments) {
        if (isDestroyed || numberOfRequestedMemorySegments > currentPoolSize) {
            returnMemorySegment(segment);
        }
        else {
            EventListener<Buffer> listener = registeredListeners.poll();

            if (listener == null) {
                availableMemorySegments.add(segment); // no listener waiting: put the segment back for the next request
                availableMemorySegments.notify();
            }
            else {
                try {
                    listener.onEvent(new Buffer(segment, this)); // a listener is waiting: hand the buffer to it directly
                }
                catch (Throwable ignored) {
                    availableMemorySegments.add(segment);
                    availableMemorySegments.notify();
                }
            }
        }
    }
}