Flink - ResultPartition
Data is generally emitted through Collector.collect:
public interface Collector<T> {

    /**
     * Emits a record.
     *
     * @param record The record to collect.
     */
    void collect(T record);

    /**
     * Closes the collector. If any data was buffered, that data will be flushed.
     */
    void close();
}
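To recall where these calls come from: a user function emits records through the Collector handed to it by the runtime, and at the end of an operator chain those calls reach the RecordWriterOutput shown below. A minimal sketch using the standard FlatMapFunction API (the tokenizer logic is made up purely for illustration):

import org.apache.flink.api.common.functions.FlatMapFunction;
import org.apache.flink.util.Collector;

// A user-defined function: every out.collect() call below eventually reaches
// RecordWriterOutput.collect() and from there RecordWriter.emit().
public class Tokenizer implements FlatMapFunction<String, String> {

    @Override
    public void flatMap(String line, Collector<String> out) throws Exception {
        for (String token : line.split("\\s+")) {
            out.collect(token); // emit one record downstream
        }
    }
}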
Output extends Collector:
public interface Output<T> extends Collector<T> {

    /**
     * Emits a {@link Watermark} from an operator. This watermark is broadcast to all downstream
     * operators.
     *
     * <p>A watermark specifies that no element with a timestamp lower or equal to the watermark
     * timestamp will be emitted in the future.
     */
    void emitWatermark(Watermark mark);
}
RecordWriterOutput
public class RecordWriterOutput<OUT> implements Output<StreamRecord<OUT>> {

    private StreamRecordWriter<SerializationDelegate<StreamElement>> recordWriter;

    private SerializationDelegate<StreamElement> serializationDelegate;

    @Override
    public void collect(StreamRecord<OUT> record) {
        serializationDelegate.setInstance(record);

        try {
            recordWriter.emit(serializationDelegate);
        }
        catch (Exception e) {
            throw new RuntimeException(e.getMessage(), e);
        }
    }
}
RecordWriter
public class RecordWriter<T extends IOReadableWritable> {

    protected final ResultPartitionWriter writer;      // the writer that writes into the ResultPartition

    private final ChannelSelector<T> channelSelector;  // picks the target channel(s); defaults to RoundRobinChannelSelector

    private final int numChannels;

    /** {@link RecordSerializer} per outgoing channel */
    private final RecordSerializer<T>[] serializers;

    public RecordWriter(ResultPartitionWriter writer) {
        this(writer, new RoundRobinChannelSelector<T>());
    }

    @SuppressWarnings("unchecked")
    public RecordWriter(ResultPartitionWriter writer, ChannelSelector<T> channelSelector) {
        this.writer = writer;
        this.channelSelector = channelSelector;
        this.numChannels = writer.getNumberOfOutputChannels(); // number of channels

        /**
         * The runtime exposes a channel abstraction for the produced results
         * (see {@link ChannelSelector}). Every channel has an independent
         * serializer.
         */
        this.serializers = new SpanningRecordSerializer[numChannels];
        for (int i = 0; i < numChannels; i++) {
            serializers[i] = new SpanningRecordSerializer<T>(); // one serializer per channel
        }
    }

    public void emit(T record) throws IOException, InterruptedException {
        for (int targetChannel : channelSelector.selectChannels(record, numChannels)) { // for each selected channel
            // serialize with corresponding serializer and send full buffer
            RecordSerializer<T> serializer = serializers[targetChannel];

            synchronized (serializer) { // a channel's serializer must not be written concurrently
                SerializationResult result = serializer.addRecord(record);

                while (result.isFullBuffer()) {                                      // the buffer (MemorySegment) is full
                    Buffer buffer = serializer.getCurrentBuffer();                   // take the full buffer out
                    if (buffer != null) {
                        writeBuffer(buffer, targetChannel, serializer);              // write the buffer out
                    }

                    buffer = writer.getBufferProvider().requestBufferBlocking();     // request a new buffer
                    result = serializer.setNextBuffer(buffer);                       // hand the new buffer to the serializer
                }
            }
        }
    }
}
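The default RoundRobinChannelSelector mentioned in the comments simply cycles through the channels. A minimal sketch that follows the selectChannels(record, numChannels) contract used by emit above (a standalone illustration, not the actual Flink class):

// Illustration only: a round-robin selector matching the
// selectChannels(record, numChannels) contract used by RecordWriter.emit().
// Each record is routed to exactly one channel, cycling 0, 1, ..., numChannels - 1.
// Not thread-safe; a RecordWriter drives it from a single thread.
public class SimpleRoundRobinSelector<T> {

    private int nextChannel = -1;

    public int[] selectChannels(T record, int numChannels) {
        nextChannel = (nextChannel + 1) % numChannels;
        return new int[] { nextChannel };
    }
}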
writeBuffer
private void writeBuffer(
        Buffer buffer,
        int targetChannel,
        RecordSerializer<T> serializer) throws IOException {

    try {
        writer.writeBuffer(buffer, targetChannel);
    }
    finally {
        serializer.clearCurrentBuffer();
    }
}
As you can see, both writing buffers and requesting buffers go through the ResultPartitionWriter:
public final class ResultPartitionWriter implements EventListener<TaskEvent> {

    private final ResultPartition partition; // the ResultPartition

    private final TaskEventHandler taskEventHandler = new TaskEventHandler();

    public ResultPartitionWriter(ResultPartition partition) {
        this.partition = partition;
    }

    // ------------------------------------------------------------------------
    // Attributes
    // ------------------------------------------------------------------------

    public ResultPartitionID getPartitionId() {
        return partition.getPartitionId();
    }

    public BufferProvider getBufferProvider() {
        return partition.getBufferProvider();
    }

    public int getNumberOfOutputChannels() {
        return partition.getNumberOfSubpartitions();
    }

    // ------------------------------------------------------------------------
    // Data processing
    // ------------------------------------------------------------------------

    public void writeBuffer(Buffer buffer, int targetChannel) throws IOException {
        partition.add(buffer, targetChannel);
    }
}
ResultPartitionWriter in turn delegates everything to the ResultPartition; writeBuffer just adds the buffer to the partition.
ResultPartition
The initialization process:
The task creates its ResultPartitions during construction:
// Produced intermediate result partitions
this.producedPartitions = new ResultPartition[partitions.size()];
this.writers = new ResultPartitionWriter[partitions.size()];

for (int i = 0; i < this.producedPartitions.length; i++) {
    ResultPartitionDeploymentDescriptor desc = partitions.get(i);
    ResultPartitionID partitionId = new ResultPartitionID(desc.getPartitionId(), executionId);

    this.producedPartitions[i] = new ResultPartition(
            taskNameWithSubtaskAndId,
            jobId,
            partitionId,
            desc.getPartitionType(),
            desc.getEagerlyDeployConsumers(),
            desc.getNumberOfSubpartitions(),
            networkEnvironment.getPartitionManager(),
            networkEnvironment.getPartitionConsumableNotifier(),
            ioManager,
            networkEnvironment.getDefaultIOMode());

    this.writers[i] = new ResultPartitionWriter(this.producedPartitions[i]);
}
In Task.run, the task first registers with the NetworkEnvironment:
network.registerTask(this);
The main work here is to create a LocalBufferPool whose required size equals the number of subpartitions, and to register it with the ResultPartition:
// Create the LocalBufferPool. Note that the number of required segments equals
// the number of subpartitions, i.e. one segment per subpartition.
bufferPool = networkBufferPool.createBufferPool(partition.getNumberOfSubpartitions(), false);

// Register the local pool with the ResultPartition
partition.registerBufferPool(bufferPool);
So,
writer.getBufferProvider().requestBufferBlocking();
ends up calling LocalBufferPool.requestBuffer.
If there are availableMemorySegments, one is used directly; if there are none:
if (numberOfRequestedMemorySegments < currentPoolSize) {
    final MemorySegment segment = networkBufferPool.requestMemorySegment(); // if the pool may still grow, request a segment from the NetworkBufferPool

    if (segment != null) {
        numberOfRequestedMemorySegments++;
        availableMemorySegments.add(segment);

        continue;
    }
}

if (askToRecycle) { // if no new segment can be requested, ask the owner to try to release memory
    owner.releaseMemory(1);
}

if (isBlocking) { // as a last resort, block and wait for up to 2 seconds
    availableMemorySegments.wait(2000);
}
public void releaseMemory(int toRelease) throws IOException {
    for (ResultSubpartition subpartition : subpartitions) {
        toRelease -= subpartition.releaseMemory(); // ask each subpartition to release memory

        // Only release as much memory as needed
        if (toRelease <= 0) {
            break;
        }
    }
}
So during emit, if no segment is available, the writer blocks and waits. For a PipelinedSubpartition, releaseMemory does nothing, so if buffers are not sent out and recycled in time, the writer will keep blocking and waiting:
public int releaseMemory() {
    // The pipelined subpartition does not react to memory release requests. The buffers will be
    // recycled by the consuming task.
    return 0;
}
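This back-pressure effect can be reproduced with a much simpler model: a bounded pool of buffers where the producer blocks in request() until the consumer calls recycle(). A minimal sketch, not Flink code, with made-up names, purely to illustrate why a slow consumer stalls the writer:

import java.util.ArrayDeque;
import java.util.Queue;

// Toy model of the blocking behaviour: a fixed number of buffers circulates between
// a producer (request) and a consumer (recycle). If the consumer stops recycling,
// the producer blocks, which is exactly the pipelined back-pressure effect above.
public class ToyBufferPool {

    private final Queue<byte[]> available = new ArrayDeque<>();

    public ToyBufferPool(int numBuffers, int bufferSize) {
        for (int i = 0; i < numBuffers; i++) {
            available.add(new byte[bufferSize]);
        }
    }

    /** Blocks until a buffer has been recycled, like requestBufferBlocking(). */
    public synchronized byte[] request() throws InterruptedException {
        while (available.isEmpty()) {
            wait(); // nothing to hand out: the producer stalls here
        }
        return available.poll();
    }

    /** Returns a buffer to the pool, like Buffer.recycle() on the consumer side. */
    public synchronized void recycle(byte[] buffer) {
        available.add(buffer);
        notifyAll(); // wake up a blocked producer
    }
}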
ResultPartition.add
public void add(Buffer buffer, int subpartitionIndex) throws IOException {
    boolean success = false;

    try {
        checkInProduceState();

        final ResultSubpartition subpartition = subpartitions[subpartitionIndex]; // pick the ResultSubpartition for this index

        synchronized (subpartition) {
            success = subpartition.add(buffer); // add the buffer to the ResultSubpartition

            // Update statistics
            totalNumberOfBuffers++;
            totalNumberOfBytes += buffer.getSize();
        }
    }
    finally {
        if (success) {
            notifyPipelinedConsumers(); // notify the ResultPartitionConsumableNotifier to trigger notifyPartitionConsumable
        }
        else {
            buffer.recycle(); // on failure, recycle the buffer
        }
    }
}
For PipelinedSubpartition, add simply appends the buffer to its queue:
/**
 * A pipelined in-memory only subpartition, which can be consumed once.
 */
class PipelinedSubpartition extends ResultSubpartition {

    /**
     * A data availability listener. Registered, when the consuming task is faster than the
     * producing task.
     */
    private NotificationListener registeredListener; // notified when new data arrives, so the consumer can react

    /** The read view to consume this subpartition. */
    private PipelinedSubpartitionView readView;

    /** All buffers of this subpartition. Access to the buffers is synchronized on this object. */
    final ArrayDeque<Buffer> buffers = new ArrayDeque<Buffer>(); // the buffer queue

    PipelinedSubpartition(int index, ResultPartition parent) {
        super(index, parent);
    }

    @Override
    public boolean add(Buffer buffer) {
        checkNotNull(buffer);

        final NotificationListener listener;

        synchronized (buffers) {
            if (isReleased || isFinished) {
                return false;
            }

            // Add the buffer and update the stats
            buffers.add(buffer); // append to the buffer queue
            updateStatistics(buffer);

            // Get the listener...
            listener = registeredListener;
            registeredListener = null;
        }

        // Notify the listener outside of the synchronized block
        if (listener != null) {
            listener.onNotification(); // tell the waiting consumer that data is available
        }

        return true;
    }
}
NettyConnectionManager
@Override
public void start(ResultPartitionProvider partitionProvider,
                  TaskEventDispatcher taskEventDispatcher,
                  NetworkBufferPool networkbufferPool) throws IOException {

    PartitionRequestProtocol partitionRequestProtocol =
            new PartitionRequestProtocol(partitionProvider, taskEventDispatcher, networkbufferPool);

    client.init(partitionRequestProtocol, bufferPool);
    server.init(partitionRequestProtocol, bufferPool);
}
PartitionRequestProtocol
// +-------------------------------------------------------------------+
// |                        SERVER CHANNEL PIPELINE                     |
// |                                                                     |
// |    +----------+----------+ (3) write  +----------------------+     |
// |    | Queue of queues     +----------->| Message encoder      |     |
// |    +----------+----------+            +-----------+----------+     |
// |              /|\                                 \|/               |
// |               | (2) enqueue                       |                |
// |    +----------+----------+                        |                |
// |    | Request handler     |                        |                |
// |    +----------+----------+                        |                |
// |              /|\                                  |                |
// |               |                                   |                |
// |    +----------+----------+                        |                |
// |    | Message decoder     |                        |                |
// |    +----------+----------+                        |                |
// |              /|\                                  |                |
// |               |                                   |                |
// |    +----------+----------+                        |                |
// |    | Frame decoder       |                        |                |
// |    +----------+----------+                        |                |
// |              /|\                                  |                |
// +---------------+-----------------------------------+---------------+
// |               | (1) client request               \|/              |
// +---------------+-----------------------------------+---------------+
// |               |                                   |                |
// |       [ Socket.read() ]                    [ Socket.write() ]      |
// |                                                                    |
// |  Netty Internal I/O Threads (Transport Implementation)             |
// +-------------------------------------------------------------------+

@Override
public ChannelHandler[] getServerChannelHandlers() {
    PartitionRequestQueue queueOfPartitionQueues = new PartitionRequestQueue();
    PartitionRequestServerHandler serverHandler = new PartitionRequestServerHandler(
            partitionProvider, taskEventDispatcher, queueOfPartitionQueues, networkbufferPool);

    return new ChannelHandler[] {
            messageEncoder,
            createFrameLengthDecoder(),
            messageDecoder,
            serverHandler,
            queueOfPartitionQueues
    };
}
PartitionRequestServerHandler
The server handler allocates a buffer pool with a required size of only 1; since the second argument is false (not fixed-size), the pool can still be given additional segments when the NetworkBufferPool has spare ones:
public void channelRegistered(ChannelHandlerContext ctx) throws Exception {
    super.channelRegistered(ctx);

    bufferPool = networkBufferPool.createBufferPool(1, false);
}
protected void channelRead0(ChannelHandlerContext ctx, NettyMessage msg) throws Exception
PartitionRequest request = (PartitionRequest) msg;

LOG.debug("Read channel on {}: {}.", ctx.channel().localAddress(), request);

try {
    ResultSubpartitionView subpartition =
            partitionProvider.createSubpartitionView(
                    request.partitionId, request.queueIndex, bufferPool);

    outboundQueue.enqueue(subpartition, request.receiverId); // enqueue into the PartitionRequestQueue for sending
}
ResultPartitionManager implements the ResultPartitionProvider interface, so this calls ResultPartitionManager.createSubpartitionView:
synchronized (registeredPartitions) {
    final ResultPartition partition =
            registeredPartitions.get(partitionId.getProducerId(), partitionId.getPartitionId());

    return partition.createSubpartitionView(subpartitionIndex, bufferProvider);
}
ResultPartition
public ResultSubpartitionView createSubpartitionView(int index, BufferProvider bufferProvider) throws IOException {
    int refCnt = pendingReferences.get();

    checkState(refCnt != -1, "Partition released.");
    checkState(refCnt > 0, "Partition not pinned.");

    ResultSubpartitionView readView = subpartitions[index].createReadView(bufferProvider);

    return readView;
}
The definition of pendingReferences:
/**
 * The total number of references to subpartitions of this result. The result partition can be
 * safely released, iff the reference count is zero. A reference count of -1 denotes that the
 * result partition has been released.
 */
private final AtomicInteger pendingReferences = new AtomicInteger();
PipelinedSubpartitionView
class PipelinedSubpartitionView implements ResultSubpartitionView {

    /** The subpartition this view belongs to. */
    private final PipelinedSubpartition parent;

    /** Flag indicating whether this view has been released. */
    private AtomicBoolean isReleased = new AtomicBoolean();

    PipelinedSubpartitionView(PipelinedSubpartition parent) {
        this.parent = checkNotNull(parent);
    }

    @Override
    public Buffer getNextBuffer() {
        synchronized (parent.buffers) {
            return parent.buffers.poll(); // poll directly from the buffer queue of the parent PipelinedSubpartition
        }
    }

    @Override
    public boolean registerListener(NotificationListener listener) {
        return !isReleased.get() && parent.registerListener(listener);
    }

    @Override
    public void notifySubpartitionConsumed() { // once fully consumed, release this subpartition
        releaseAllResources();
    }

    @Override
    public void releaseAllResources() {
        if (isReleased.compareAndSet(false, true)) {
            // The view doesn't hold any resources and the parent cannot be restarted. Therefore,
            // it's OK to notify about consumption as well.
            parent.onConsumedSubpartition();
        }
    }
}
When PartitionRequestQueue sends data, it calls getNextBuffer to fetch the next buffer. When sending is done, i.e. when an EndOfPartitionEvent is read, it calls notifySubpartitionConsumed. Release then goes through PipelinedSubpartition.onConsumedSubpartition –> ResultPartition.onConsumedSubpartition:
void onConsumedSubpartition(int subpartitionIndex) {
    if (isReleased.get()) { // already released
        return;
    }

    int refCnt = pendingReferences.decrementAndGet(); // one subpartition fully consumed, decrement by 1

    if (refCnt == 0) { // all subpartitions have been consumed
        partitionManager.onConsumedPartition(this); // tell the partition manager to release this partition
    }
}
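The pendingReferences pattern, reference counting with -1 as a "released" sentinel, can be summarized in isolation. A minimal sketch of the same idea (illustrative only; the class and the pin method are made up, not taken from Flink):

import java.util.concurrent.atomic.AtomicInteger;

// Illustration of the pendingReferences idea: each consumer pins the resource once,
// each completed consumption unpins it; when the count reaches zero the resource is
// released, and -1 marks it as released for good.
public class RefCountedResource {

    private final AtomicInteger pendingReferences = new AtomicInteger();

    /** Called once per expected consumer (hypothetical helper, not Flink's API). */
    public void pin() {
        while (true) {
            int refCnt = pendingReferences.get();
            if (refCnt == -1) {
                throw new IllegalStateException("Already released.");
            }
            if (pendingReferences.compareAndSet(refCnt, refCnt + 1)) {
                return;
            }
        }
    }

    /** Called when one consumer is done; releases the resource after the last one. */
    public void onConsumed() {
        if (pendingReferences.decrementAndGet() == 0) {
            release();
        }
    }

    private void release() {
        pendingReferences.set(-1);
        // free the underlying resources here
    }
}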
When are the buffers in a PipelinedSubpartition released and returned to the LocalBufferPool? Let's look at the sending path of PartitionRequestQueue in detail.
writeAndFlushNextMessageIfPossible
buffer = currentPartitionQueue.getNextBuffer();

BufferResponse resp = new BufferResponse(
        buffer,
        currentPartitionQueue.getSequenceNumber(),
        currentPartitionQueue.getReceiverId()); // wrap the buffer into a BufferResponse

if (!buffer.isBuffer() &&
        EventSerializer.fromBuffer(buffer, getClass().getClassLoader()).getClass() == EndOfPartitionEvent.class) { // end-of-partition event reached

    currentPartitionQueue.notifySubpartitionConsumed(); // notify consumption
    currentPartitionQueue.releaseAllResources();        // release all resources
    markAsReleased(currentPartitionQueue.getReceiverId());

    currentPartitionQueue = null;
}

// The actual send. WriteAndFlushNextMessageIfPossibleListener calls
// writeAndFlushNextMessageIfPossible again, so buffers keep being read and sent.
channel.writeAndFlush(resp).addListener(writeListener);
In BufferResponse.write, the header (16-byte receiver ID, 4-byte sequence number, 1-byte isBuffer flag, 4-byte length) and the buffer contents are written into a Netty ByteBuf, and the original buffer is recycled in the finally block:
@Override
ByteBuf write(ByteBufAllocator allocator) throws IOException {
    int length = 16 + 4 + 1 + 4 + buffer.getSize();

    ByteBuf result = null;

    try {
        result = allocateBuffer(allocator, ID, length);

        receiverId.writeTo(result);
        result.writeInt(sequenceNumber);
        result.writeBoolean(buffer.isBuffer());
        result.writeInt(buffer.getSize());
        result.writeBytes(buffer.getNioBuffer());

        return result;
    }
    catch (Throwable t) {
        if (result != null) {
            result.release();
        }

        throw new IOException(t);
    }
    finally {
        if (buffer != null) {
            buffer.recycle(); // recycle the buffer once its contents have been copied into the Netty buffer
        }
    }
}
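The same wire layout can be made explicit without Netty using a plain java.nio.ByteBuffer. A sketch under the assumption that the receiver ID is two longs, as its 16-byte share of the length computation suggests (illustration only, not the Flink encoder):

import java.nio.ByteBuffer;

// Illustration of the BufferResponse payload layout seen above:
// 16 bytes receiver id + 4 bytes sequence number + 1 byte isBuffer flag
// + 4 bytes payload length + the payload itself.
public final class BufferResponseLayout {

    public static ByteBuffer encode(long receiverIdUpper, long receiverIdLower,
                                    int sequenceNumber, boolean isBuffer, byte[] payload) {
        ByteBuffer out = ByteBuffer.allocate(16 + 4 + 1 + 4 + payload.length);

        out.putLong(receiverIdUpper);              // 16-byte receiver id (two longs)
        out.putLong(receiverIdLower);
        out.putInt(sequenceNumber);                // 4-byte sequence number
        out.put((byte) (isBuffer ? 1 : 0));        // 1-byte flag: data buffer vs. event
        out.putInt(payload.length);                // 4-byte payload length
        out.put(payload);                          // the buffer contents

        out.flip();
        return out;
    }
}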
Buffer
public void recycle() {
    synchronized (recycleLock) {
        if (--referenceCount == 0) {
            recycler.recycle(memorySegment);
        }
    }
}
LocalBufferPool
@Override
public void recycle(MemorySegment segment) {
    synchronized (availableMemorySegments) {
        if (isDestroyed || numberOfRequestedMemorySegments > currentPoolSize) {
            returnMemorySegment(segment);
        }
        else {
            EventListener<Buffer> listener = registeredListeners.poll();

            if (listener == null) {
                availableMemorySegments.add(segment); // no listener waiting: put the segment back for the next request
                availableMemorySegments.notify();
            }
            else {
                try {
                    listener.onEvent(new Buffer(segment, this)); // a listener is waiting: hand the buffer to it directly
                }
                catch (Throwable ignored) {
                    availableMemorySegments.add(segment);
                    availableMemorySegments.notify();
                }
            }
        }
    }
}