A Shallow Look at the grpc-go Source Code (Part 1)

I have been studying RPC recently. If I had to design an RPC framework from scratch, what exactly would I need to build, looking at it from a user's point of view?

First, an RPC is fundamentally a remote call, so data has to travel over the network. There are several transport protocols to choose from, but for reliability we usually default to TCP. To hide the complexity of network I/O, we wrap the sending and receiving of raw bytes in a dedicated component, which we can call the transport module.

Second, the user makes a request as a method call, and the method's parameters and return values are objects. Objects cannot be sent over the network directly, so we first have to turn them into transmittable bytes; that is serialization. But shipping the serialized arguments to the service provider is not enough on its own: we also need to add "punctuation" after each request's bytes so different requests can be told apart. Whatever sits between two such markers is the body of one request. This step is what we call protocol framing.
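
To make the "punctuation" idea concrete, here is a minimal sketch of length-prefix framing in Go. It is not gRPC's actual wire format (that is richer, and sits on top of HTTP/2), but the principle is the same: write a 4-byte big-endian length in front of every message so the receiver always knows where one request ends and the next begins.

package main

import (
    "bytes"
    "encoding/binary"
    "fmt"
    "io"
)

// writeFrame prefixes the payload with its length so the receiver can find its end.
func writeFrame(w io.Writer, payload []byte) error {
    var hdr [4]byte
    binary.BigEndian.PutUint32(hdr[:], uint32(len(payload)))
    if _, err := w.Write(hdr[:]); err != nil {
        return err
    }
    _, err := w.Write(payload)
    return err
}

// readFrame reads one length-prefixed payload back out of the stream.
func readFrame(r io.Reader) ([]byte, error) {
    var hdr [4]byte
    if _, err := io.ReadFull(r, hdr[:]); err != nil {
        return nil, err
    }
    payload := make([]byte, binary.BigEndian.Uint32(hdr[:]))
    _, err := io.ReadFull(r, payload)
    return payload, err
}

func main() {
    var buf bytes.Buffer
    _ = writeFrame(&buf, []byte("request-1"))
    _ = writeFrame(&buf, []byte("request-2"))
    for {
        p, err := readFrame(&buf)
        if err != nil {
            break
        }
        fmt.Println(string(p))
    }
}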

Third, to squeeze out more transport performance we can add compression to the protocol module, since compressing is just another operation on the bytes being sent. On the wire, a large request may be split into multiple packets at the link layer; compressing the payload makes it smaller, which means less splitting and a shorter overall transfer time. This is the compression and decompression step.

Fourth, as the number of service providers grows, one interface ends up mapping to many IP:port addresses, yet all of those providers should be transparent to the caller. So the RPC framework also has to locate all the providers and maintain the mapping between an interface and its providers' addresses, so that a caller can quickly find the right destination when it issues a request. This is what we usually call service discovery.
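
gRPC ships resolvers that cover the simple cases, and custom schemes can be added by registering a resolver.Builder. As a small sketch, dialing with the built-in dns scheme makes the client resolve the host to however many backend addresses exist; the hostname below is a made-up placeholder.

// The "dns" scheme resolves the host to one or more backend addresses,
// which is exactly where service discovery plugs in. The hostname is a placeholder.
conn, err := grpc.Dial("dns:///prod-service.example.com:8082", grpc.WithInsecure())
if err != nil {
    log.Fatal(err)
}
defer conn.Close()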

Fifth, a mature RPC framework also has to give developers hooks that reach deep into its internals. Look at the various wrappers in grpc: transport, encoding, protocol, service discovery, and logging all expose standard, uniform interfaces that developers can plug their own implementations into.
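
As a taste of these hooks, here is a sketch (not part of this post's project) of a client-side unary interceptor that logs every call and how long it took; it is wired in with grpc.WithUnaryInterceptor at dial time.

package main

import (
    "context"
    "log"
    "time"

    "google.golang.org/grpc"
)

// loggingInterceptor runs around every unary RPC made on the connection.
func loggingInterceptor(ctx context.Context, method string, req, reply interface{},
    cc *grpc.ClientConn, invoker grpc.UnaryInvoker, opts ...grpc.CallOption) error {
    start := time.Now()
    err := invoker(ctx, method, req, reply, cc, opts...) // the actual RPC
    log.Printf("method=%s duration=%s err=%v", method, time.Since(start), err)
    return err
}

func main() {
    conn, err := grpc.Dial(":8082",
        grpc.WithInsecure(),
        grpc.WithUnaryInterceptor(loggingInterceptor), // hook into every unary call
    )
    if err != nil {
        log.Fatal(err)
    }
    defer conn.Close()
}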

With those questions in mind I got curious about how grpc actually implements all of this, so I set up a client and a server and went through the code to see how grpc does RPC.

Client code

package main

import (
    "context"
    "fmt"
    "log"
    "time"

    "grpc-test/client/service/pbservice"

    "google.golang.org/grpc"
    "google.golang.org/grpc/codes"
    "google.golang.org/grpc/status"
)

func main() {
    // 1. Create the connection; 8082 is the port the server listens on.
    // grpc.WithInsecure() is needed here, otherwise the call fails without a certificate.
    conn, err := grpc.Dial(":8082", grpc.WithInsecure())
    if err != nil {
        log.Fatal(err)
    }

    // Close the connection on exit
    defer conn.Close()

    // 2. Call NewProdServiceClient generated in Product.pb.go
    productServiceClient := pbservice.NewProdServiceClient(conn)


    // Set a 10-second deadline for the call
    ctx, cancel := context.WithDeadline(context.Background(), time.Now().Add(10*time.Second))
    defer cancel()
    resp, err := productServiceClient.GetProductStock(ctx, &pbservice.ProductRequest{ProdId: 233})
    if err != nil {
        state, ok := status.FromError(err)
        if ok && state.Code() == codes.DeadlineExceeded {
            log.Fatalln("client.Search err: deadline")
        }
        log.Fatal("调用gRPC方法错误: ", err)
    }

    fmt.Println("调用gRPC方法成功,ProdStock = ", resp.ProdStock)
}

 

Server code

package main

import (
    "google.golang.org/grpc"
    "grpc-test/server/pbservice/product"
    "net"
    "log"
)

func main() {
    rpcServer := grpc.NewServer()

    // Register the new ProdService; RegisterProdServiceServer wires the methods of the generated service implementation into the gRPC server
    product.RegisterProdServiceServer(rpcServer, new(product.ProdService))

    listener, err := net.Listen("tcp", ":8082")
    if err != nil {
        log.Fatal("服务监听端口失败", err)
    }

    // Run rpcServer on the listener
    _ = rpcServer.Serve(listener)
}

 

Now let's trace through the client side of the code:

// Create the connection
conn, err := grpc.Dial(":8082", grpc.WithInsecure())
if err != nil {
    log.Fatal(err)
}

Dial is just a thin wrapper around DialContext, where the actual initialization happens: DialContext first creates an empty ClientConn and then iterates over the opts ...DialOption arguments; if any are present, each one is executed to set the corresponding field.
// Dial creates a client connection to the given target.
func Dial(target string, opts ...DialOption) (*ClientConn, error) {
    return DialContext(context.Background(), target, opts...)
}
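
A side note: Dial does not block waiting for the connection to come up. If you want it to, you can hand DialContext a context with a deadline together with WithBlock. A sketch (imports as in the client example above; the 3-second timeout is an arbitrary choice):

// DialContext with WithBlock waits until the underlying transport is actually
// established, or fails once the context deadline expires.
ctx, cancel := context.WithTimeout(context.Background(), 3*time.Second)
defer cancel()

conn, err := grpc.DialContext(ctx, ":8082",
    grpc.WithInsecure(),
    grpc.WithBlock(),
)
if err != nil {
    log.Fatal(err)
}
defer conn.Close()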

The ClientConn object is created in clientconn.go; its struct contains the following fields:

// ClientConn represents a virtual connection to a conceptual endpoint, to
// perform RPCs.
//
// A ClientConn is free to have zero or more actual connections to the endpoint
// based on configuration, load, etc. It is also free to determine which actual
// endpoints to use and may change it every RPC, permitting client-side load
// balancing.
//
// A ClientConn encapsulates a range of functionality including name
// resolution, TCP connection establishment (with retries and backoff) and TLS
// handshakes. It also handles errors on established connections by
// re-resolving the name and reconnecting.
type ClientConn struct {
    ctx    context.Context
    cancel context.CancelFunc

    target       string
    parsedTarget resolver.Target   // parsed form of the target, consumed by the name resolver
    authority    string
    dopts        dialOptions                // options set at dial time; applied to every request (see the combine method in call.go)
    csMgr        *connectivityStateManager  // maintains the connection state

    balancerBuildOpts balancer.BuildOptions
    blockingpicker    *pickerWrapper        // load-balancing picker

    mu              sync.RWMutex
    resolverWrapper *ccResolverWrapper      // implements resolver.ClientConn (see ./resolver/resolver.go); a wrapper sitting on top of ClientConn
    sc              *ServiceConfig
    conns           map[*addrConn]struct{}  // where the connections are stored
    // Keepalive parameter can be updated if a GoAway is received.
    mkp             keepalive.ClientParameters
    curBalancerName string
    balancerWrapper *ccBalancerWrapper      // wrapper around the load balancer
    retryThrottler  atomic.Value

    firstResolveEvent *grpcsync.Event

    channelzID int64 // channelz unique identification number
    czData     *channelzData
}
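
csMgr, the connectivityStateManager, tracks the state of this virtual connection. From the outside you can observe it through the public ClientConn API; a small sketch, reusing conn from the client example above (the 5-second timeout is arbitrary):

// GetState reports the current connectivity state (Idle, Connecting, Ready, ...);
// WaitForStateChange blocks until it changes or the context expires.
state := conn.GetState()
log.Println("current state:", state)

ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second)
defer cancel()
if conn.WaitForStateChange(ctx, state) {
    log.Println("new state:", conn.GetState())
}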

When a method from the pb-generated code is called, grpc swaps the call out for a request on its own wrapped network layer, using the connection's Invoke() method:

func (c *prodServiceClient) GetProductStock(ctx context.Context, in *ProductRequest, opts ...grpc.CallOption) (*ProductResponse, error) {
    out := new(ProductResponse)
    err := c.cc.Invoke(ctx, "/service.ProdService/GetProductStock", in, out, opts...)
    if err != nil {
        return nil, err
    }
    return out, nil
}

Inside call.go, the invoke() function does three main things: obtain a connection and build the stream (newClientStream), send the data (SendMsg()), and receive the data (RecvMsg()).

func invoke(ctx context.Context, method string, req, reply interface{}, cc *ClientConn, opts ...CallOption) error {
    cs, err := newClientStream(ctx, unaryStreamDesc, cc, method, opts...)
    if err != nil {
        return err
    }
    if err := cs.SendMsg(req); err != nil {
        return err
    }
    return cs.RecvMsg(reply)
}

The code above is still fairly easy to follow, but once you dig into the transport layer underneath, grpc is full of obscure logic; all you can do is grit your teeth and keep reading 😂😂

 

newClientStream analysis

The flow is roughly:

1. Run the CallOptions that were passed in, calling before() ahead of the request

2. Set up the compressor

3. Build the request ctx, including tracing, encoding, compression, and so on

4. Create the clientStream, which is responsible for sending data on the stream

5. Issue the request via newAttemptLocked and withRetry

6. Watch the ctx so the stream is cleaned up when the context is cancelled

func newClientStream(ctx context.Context, desc *StreamDesc, cc *ClientConn, method string, opts ...CallOption) (_ ClientStream, err error) {
    // channelz records stats about channels and calls (shared across goroutines) for debugging and introspection
    if channelz.IsOn() {
        cc.incrCallsStarted()
        defer func() {
            if err != nil {
                cc.incrCallsFailed()
            }
        }()
    }

    // callInfo can be configured per call and controls the stream, codec, and so on for this request
    c := defaultCallInfo()

    // Provide an opportunity for the first RPC to see the first service config
    // provided by the resolver.
    if err := cc.waitForResolvedAddrs(ctx); err != nil {
        return nil, err
    }
    mc := cc.GetMethodConfig(method)
    if mc.WaitForReady != nil {
        c.failFast = !*mc.WaitForReady
    }

    // ... omitted
    
    // Run the CallOption hooks before the request; this is one of the places developers can step in
    for _, o := range opts {
        if err := o.before(c); err != nil {
            return nil, toRPCErr(err)
        }
    }
    c.maxSendMessageSize = getMaxSize(mc.MaxReqSize, c.maxSendMessageSize, defaultClientMaxSendMessageSize)
    c.maxReceiveMessageSize = getMaxSize(mc.MaxRespSize, c.maxReceiveMessageSize, defaultClientMaxReceiveMessageSize)
    if err := setCallInfoCodec(c); err != nil {
        return nil, err
    }

    callHdr := &transport.CallHdr{
        Host:           cc.authority,
        Method:         method,
        ContentSubtype: c.contentSubtype,
    }

    // Here you can pick the compression for this call: as long as c.compressorType names a
    // compressor grpc supports (gzip, for example) it will be used. You can also register your
    // own; see encoding/encoding.go for how compressor registration works.
    // Set our outgoing compression according to the UseCompressor CallOption, if
    // set.  In that case, also find the compressor from the encoding package.
    // Otherwise, use the compressor configured by the WithCompressor DialOption,
    // if set.
    var cp Compressor
    var comp encoding.Compressor
    if ct := c.compressorType; ct != "" {
        callHdr.SendCompress = ct
        if ct != encoding.Identity {
            comp = encoding.GetCompressor(ct)
            if comp == nil {
                return nil, status.Errorf(codes.Internal, "grpc: Compressor is not installed for requested grpc-encoding %q", ct)
            }
        }
    } else if cc.dopts.cp != nil {
        callHdr.SendCompress = cc.dopts.cp.Type()
        cp = cc.dopts.cp
    }
    if c.creds != nil {
        callHdr.Creds = c.creds
    }

    // If tracing is enabled, grpc creates the trace context here and wraps it into ctx so it is propagated to the called service
    var trInfo *traceInfo
    if EnableTracing {
        trInfo = &traceInfo{
            tr: trace.New("grpc.Sent."+methodFamily(method), method),
            firstLine: firstLine{
                client: true,
            },
        }
        if deadline, ok := ctx.Deadline(); ok {
            trInfo.firstLine.deadline = time.Until(deadline)
        }
        trInfo.tr.LazyLog(&trInfo.firstLine, false)
        ctx = trace.NewContext(ctx, trInfo.tr)
    }


    // The RPC's ctx carries: the fail-fast flag (failFast), the codec, and the compressors (cp and comp)
    ctx = newContextWithRPCInfo(ctx, c.failFast, c.codec, cp, comp)
    // ... omitted

    cs := &clientStream{
        callHdr:      callHdr,
        ctx:          ctx,
        methodConfig: &mc,
        opts:         opts,
        callInfo:     c,
        cc:           cc,
        desc:         desc,
        codec:        c.codec,
        cp:           cp,
        comp:         comp,
        cancel:       cancel,
        beginTime:    beginTime,
        firstAttempt: true,
    }
    // Set up binary logging
    // ... omitted
    cs.binlog = binarylog.GetMethodLogger(method)

    // The request itself happens in two steps: newAttemptLocked and withRetry
    cs.callInfo.stream = cs
    // Only this initial attempt has stats/tracing.
    // TODO(dfawley): move to newAttempt when per-attempt stats are implemented.
    if err := cs.newAttemptLocked(sh, trInfo); err != nil {
        cs.finish(err)
        return nil, err
    }

    op := func(a *csAttempt) error { return a.newStream() }
    if err := cs.withRetry(op, func() { cs.bufferForRetryLocked(0, op) }); err != nil {
        cs.finish(err)
        return nil, err
    }

    // ... omitted
    // Watch the connection's ctx; if it is cancelled, the clientStream has to finish() and tear the stream down
    if desc != unaryStreamDesc {
        // Listen on cc and stream contexts to cleanup when the user closes the
        // ClientConn or cancels the stream context.  In all other cases, an error
        // should already be injected into the recv buffer by the transport, which
        // the client will eventually receive, and then we will cancel the stream's
        // context in clientStream.finish.
        go func() {
            select {
            case <-cc.ctx.Done():
                cs.finish(ErrClientConnClosing)
            case <-ctx.Done():
                cs.finish(toRPCErr(ctx.Err()))
            }
        }()
    }
    return cs, nil
}
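
A note on the compression branch above: c.compressorType is normally set through the UseCompressor CallOption. A sketch of the caller's side, reusing the earlier client; the blank import registers the gzip compressor with the encoding package mentioned in the comments:

import (
    "google.golang.org/grpc"
    _ "google.golang.org/grpc/encoding/gzip" // registers the "gzip" compressor
)

// Ask for gzip compression on this one call; the server must also support it.
resp, err := productServiceClient.GetProductStock(ctx,
    &pbservice.ProductRequest{ProdId: 233},
    grpc.UseCompressor("gzip"),
)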

The callInfo struct that holds the per-RPC configuration in grpc:

// callInfo contains all related configuration and information about an RPC.
type callInfo struct {
    compressorType        string
    failFast              bool
    stream                ClientStream
    maxReceiveMessageSize *int
    maxSendMessageSize    *int
    creds                 credentials.PerRPCCredentials
    contentSubtype        string
    codec                 baseCodec
    maxRetryRPCBufferSize int
}
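
Most of these fields are filled in from the CallOptions passed with the call. A sketch, again reusing the earlier client, of how a few common options map onto callInfo (the sizes are arbitrary):

// WaitForReady flips callInfo.failFast, and the two size options set
// maxReceiveMessageSize / maxSendMessageSize for this call only.
resp, err := productServiceClient.GetProductStock(ctx,
    &pbservice.ProductRequest{ProdId: 233},
    grpc.WaitForReady(true),              // callInfo.failFast = false
    grpc.MaxCallRecvMsgSize(4*1024*1024), // callInfo.maxReceiveMessageSize
    grpc.MaxCallSendMsgSize(1*1024*1024), // callInfo.maxSendMessageSize
)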

Obtaining the transport connection is handled by clientStream.newAttemptLocked:

// newAttemptLocked creates a new attempt with a transport.
// If it succeeds, then it replaces clientStream's attempt with this new attempt.
func (cs *clientStream) newAttemptLocked(sh stats.Handler, trInfo *traceInfo) (retErr error) {
    // newAttempt is the struct representing one attempt on a connection
    newAttempt := &csAttempt{
        cs:           cs,
        dc:           cs.cc.dopts.dc,
        statsHandler: sh,
        trInfo:       trInfo,
    }
    defer func() {
        if retErr != nil {
            // This attempt is not set in the clientStream, so it's finish won't
            // be called. Call it here for stats and trace in case they are not
            // nil.
            newAttempt.finish(retErr)
        }
    }()

    if err := cs.ctx.Err(); err != nil {
        return toRPCErr(err)
    }

    // Get a usable transport connection from the clientconn
    t, done, err := cs.cc.getTransport(cs.ctx, cs.callInfo.failFast, cs.callHdr.Method)
    if err != nil {
        return err
    }
    if trInfo != nil {
        trInfo.firstLine.SetRemoteAddr(t.RemoteAddr())
    }
    newAttempt.t = t
    newAttempt.done = done
    cs.attempt = newAttempt
    return nil
}

Inside getTransport, the connection is actually provided by the pick method in picker_wrapper.go. As soon as you see "wrapper" you should suspect this is something developers can supply themselves, and indeed a custom load-balancing algorithm can decide which connection to pick. So what does pick actually do?

pick is designed to be called in a blocking fashion, via cc.blockingpicker.pick().

func (cc *ClientConn) getTransport(ctx context.Context, failfast bool, method string) (transport.ClientTransport, func(balancer.DoneInfo), error) {
    t, done, err := cc.blockingpicker.pick(ctx, failfast, balancer.PickInfo{
        Ctx:            ctx,
        FullMethodName: method,
    })
    if err != nil {
        return nil, nil, toRPCErr(err)
    }
    return t, done, nil
}


// pick returns the transport that will be used for the RPC.
// It may block in the following cases:
// - there's no picker
// - the current picker returns ErrNoSubConnAvailable
// - the current picker returns other errors and failfast is false.
// - the subConn returned by the current picker is not READY
// When one of these situations happens, pick blocks until the picker gets updated.
func (pw *pickerWrapper) pick(ctx context.Context, failfast bool, info balancer.PickInfo) (transport.ClientTransport, func(balancer.DoneInfo), error) {
    var ch chan struct{}

    var lastPickErr error

    // Loop until we get a connection from the balancer's pool that satisfies the requirements
    for {
        pw.mu.Lock()
        if pw.done {
            pw.mu.Unlock()
            return nil, nil, ErrClientConnClosing
        }

        if pw.picker == nil {
            ch = pw.blockingCh
        }
        if ch == pw.blockingCh {
            // To keep pick itself free of races, a channel gates who gets to call it:
            // pw.blockingCh ensures only one goroutine holds the right to proceed.
            // If pw.picker is nil, or another goroutine is already in pick, we block here.

            // This could happen when either:
            // - pw.picker is nil (the previous if condition), or
            // - has called pick on the current picker.
            pw.mu.Unlock()
            select {
            case <-ctx.Done():
                var errStr string
                if lastPickErr != nil {
                    errStr = "latest balancer error: " + lastPickErr.Error()
                } else if connectionErr := pw.connectionError(); connectionErr != nil {
                    errStr = "latest connection error: " + connectionErr.Error()
                } else {
                    errStr = ctx.Err().Error()
                }
                switch ctx.Err() {
                case context.DeadlineExceeded:
                    return nil, nil, status.Error(codes.DeadlineExceeded, errStr)
                case context.Canceled:
                    return nil, nil, status.Error(codes.Canceled, errStr)
                }
            case <-ch:
            }
            continue
        }

        ch = pw.blockingCh
        p := pw.picker
        pw.mu.Unlock()

        // Call the picker's Pick() method to obtain a usable connection
        pickResult, err := p.Pick(info)
        // error handling
        if err != nil {
            // ... omitted
        }

        // SubConn here is a connection we picked, but it may not be READY yet; only a ready connection is returned
        acw, ok := pickResult.SubConn.(*acBalancerWrapper)
        if !ok {
            grpclog.Error("subconn returned from pick is not *acBalancerWrapper")
            continue
        }
        if t, ok := acw.getAddrConn().getReadyTransport(); ok {
            if channelz.IsOn() {
                return t, doneChannelzWrapper(acw, pickResult.Done), nil
            }
            return t, pickResult.Done, nil
        }
        if pickResult.Done != nil {
            // Calling done with nil error, no bytes sent and no bytes received.
            // DoneInfo with default value works.
            pickResult.Done(balancer.DoneInfo{})
        }
        grpclog.Infof("blockingPicker: the picked transport is not ready, loop back to repick")
        // If ok == false, ac.state is not READY.
        // A valid picker always returns READY subConn. This means the state of ac
        // just changed, and picker will be updated shortly.
        // continue back to the beginning of the for loop to repick.
    }
}

A question comes up here: after pick returns a connection, why does the code still have to check whether it is ready via acw.getAddrConn().getReadyTransport()? Think about it for a moment and it becomes clear. pick only returns a candidate connection for some ip:port of the service; at that point there is no way to tell whether it is actually usable. With services going up and down, an ip:port may not have been removed from the pool yet, so the "is it usable" decision is left to the outer balancer wrapper, and SubConn is the abstraction that decouples the two; address updates reach it through the balancer.

type SubConn interface {
    // UpdateAddresses updates the addresses used in this SubConn.
    // gRPC checks if currently-connected address is still in the new list.
    // If it's in the list, the connection will be kept.
    // If it's not in the list, the connection will gracefully closed, and
    // a new connection will be created.
    //
    // This will trigger a state transition for the SubConn.
    UpdateAddresses([]resolver.Address)
    // Connect starts the connecting for this SubConn.
    Connect()
}
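
Which SubConn the picker hands back is decided by the load-balancing policy; the default is pick_first, which sticks to a single connection. A sketch of switching the client to round_robin through the default service config (treat the JSON field name as an assumption for the grpc-go version used here):

// round_robin makes the picker rotate over all READY SubConns instead of
// always returning the first one.
conn, err := grpc.Dial(":8082",
    grpc.WithInsecure(),
    grpc.WithDefaultServiceConfig(`{"loadBalancingPolicy":"round_robin"}`),
)
if err != nil {
    log.Fatal(err)
}
defer conn.Close()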

The transport we end up with is essentially a network handle, and like most handles its interface is mostly Close(), Write(), and friends:

// ClientTransport is the common interface for all gRPC client-side transport
// implementations.
type ClientTransport interface {
    // Close tears down this transport. Once it returns, the transport
    // should not be accessed any more. The caller must make sure this
    // is called only once.
    Close() error

    // GracefulClose starts to tear down the transport: the transport will stop
    // accepting new RPCs and NewStream will return error. Once all streams are
    // finished, the transport will close.
    //
    // It does not block.
    GracefulClose()

    // Write sends the data for the given stream. A nil stream indicates
    // the write is to be performed on the transport as a whole.
    Write(s *Stream, hdr []byte, data []byte, opts *Options) error

    // NewStream creates a Stream for an RPC.
    NewStream(ctx context.Context, callHdr *CallHdr) (*Stream, error)

    // CloseStream clears the footprint of a stream when the stream is
    // not needed any more. The err indicates the error incurred when
    // CloseStream is called. Must be called when a stream is finished
    // unless the associated transport is closing.
    CloseStream(stream *Stream, err error)

    // Error returns a channel that is closed when some I/O error
    // happens. Typically the caller should have a goroutine to monitor
    // this in order to take action (e.g., close the current transport
    // and create a new one) in error case. It should not return nil
    // once the transport is initiated.
    Error() <-chan struct{}

    // GoAway returns a channel that is closed when ClientTransport
    // receives the draining signal from the server (e.g., GOAWAY frame in
    // HTTP/2).
    GoAway() <-chan struct{}

    // GetGoAwayReason returns the reason why GoAway frame was received.
    GetGoAwayReason() GoAwayReason

    // RemoteAddr returns the remote network address.
    RemoteAddr() net.Addr

    // IncrMsgSent increments the number of message sent through this transport.
    IncrMsgSent()

    // IncrMsgRecv increments the number of message received through this transport.
    IncrMsgRecv()
}

After getting hold of the connection, we check whether it has actually been established; if it is still idle we simply connect():

func (ac *addrConn) getReadyTransport() (transport.ClientTransport, bool) {
    ac.mu.Lock()
    if ac.state == connectivity.Ready && ac.transport != nil {
        t := ac.transport
        ac.mu.Unlock()
        return t, true
    }
    var idle bool
    if ac.state == connectivity.Idle {
        idle = true
    }
    ac.mu.Unlock()
    // Trigger idle ac to connect.
    if idle {
        ac.connect()
    }
    return nil, false
}

 

 

SendMsg analysis

That is the whole newClientStream flow. So what does grpc do when it actually sends data?

func (cs *clientStream) SendMsg(m interface{}) (err error) {
    defer func() {
        if err != nil && err != io.EOF {
            // Call finish on the client stream for errors generated by this SendMsg
            // call, as these indicate problems created by this client.  (Transport
            // errors are converted to an io.EOF error in csAttempt.sendMsg; the real
            // error will be returned from RecvMsg eventually in that case, or be
            // retried.)
            cs.finish(err)
        }
    }()
    
    // Encode and compress the message first (this call is elided in the original
    // snippet; prepareMsg is shown below), then send it with retries via withRetry()
    hdr, payload, data, err := prepareMsg(m, cs.codec, cs.cp, cs.comp)
    if err != nil {
        return err
    }

    msgBytes := data // Store the pointer before setting to nil. For binary logging.
    op := func(a *csAttempt) error {
        err := a.sendMsg(m, hdr, payload, data)
        // nil out the message and uncomp when replaying; they are only needed for
        // stats which is disabled for subsequent attempts.
        m, data = nil, nil
        return err
    }
    err = cs.withRetry(op, func() { cs.bufferForRetryLocked(len(hdr)+len(payload), op) })
    if cs.binlog != nil && err == nil {
        cs.binlog.Log(&binarylog.ClientMessage{
            OnClientSide: true,
            Message:      msgBytes,
        })
    }
    return
}

sendMsg() itself is easy to understand: it takes the transport connection that the clientStream already obtained during initialization and calls Write() on it, pushing the data out over the network handle.

// First encode the message and assemble the protocol header and body
// prepareMsg returns the hdr, payload and data
// using the compressors passed or using the
// passed preparedmsg
func prepareMsg(m interface{}, codec baseCodec, cp Compressor, comp encoding.Compressor) (hdr, payload, data []byte, err error) {
    if preparedMsg, ok := m.(*PreparedMsg); ok {
        return preparedMsg.hdr, preparedMsg.payload, preparedMsg.encodedData, nil
    }
    // The input interface is not a prepared msg.
    // Marshal and Compress the data at this point
    data, err = encode(codec, m)
    if err != nil {
        return nil, nil, nil, err
    }
    compData, err := compress(data, cp, comp)
    if err != nil {
        return nil, nil, nil, err
    }
    hdr, payload = msgHeader(data, compData)
    return hdr, payload, data, nil
}

// Then send it
func (a *csAttempt) sendMsg(m interface{}, hdr, payld, data []byte) error {
    cs := a.cs
    if a.trInfo != nil {
        a.mu.Lock()
        if a.trInfo.tr != nil {
            a.trInfo.tr.LazyLog(&payload{sent: true, msg: m}, true)
        }
        a.mu.Unlock()
    }
    if err := a.t.Write(a.s, hdr, payld, &transport.Options{Last: !cs.desc.ClientStreams}); err != nil {
        if !cs.desc.ClientStreams {
            // For non-client-streaming RPCs, we return nil instead of EOF on error
            // because the generated code requires it.  finish is not called; RecvMsg()
            // will call it with the stream's status independently.
            return nil
        }
        return io.EOF
    }
    if a.statsHandler != nil {
        a.statsHandler.HandleRPC(cs.ctx, outPayload(true, m, data, payld, time.Now()))
    }
    if channelz.IsOn() {
        a.t.IncrMsgSent()
    }
    return nil
}

The withRetry() call chain that follows is fairly long; in the end it comes back to the connect() path mentioned above, trying to (re)establish a connection:

func (cs *clientStream) withRetry(op func(a *csAttempt) error, onSuccess func()) error {
    for {
        // retryLocked is the important part; what does it actually do?
        if err := cs.retryLocked(err); err != nil {
            cs.mu.Unlock()
            return err
        }
    }
}

func (cs *clientStream) retryLocked(lastErr error) error {
    for {
        // see newAttemptLocked
        if err := cs.newAttemptLocked(nil, nil); err != nil {
            return err
        }
    }
}

func (cs *clientStream) newAttemptLocked(sh stats.Handler, trInfo *traceInfo) (retErr error) {
    newAttempt := &csAttempt{
        cs:           cs,
        dc:           cs.cc.dopts.dc,
        statsHandler: sh,
        trInfo:       trInfo,
    }
    // Every attempt calls getTransport to obtain the connection to use; cc is the ClientConn, and this is where load balancing comes in
    t, done, err := cs.cc.getTransport(ctx, cs.callInfo.failFast, cs.callHdr.Method)
    if err != nil {
        return err
    }
    if trInfo != nil {
        trInfo.firstLine.SetRemoteAddr(t.RemoteAddr())
    }
    newAttempt.t = t
    newAttempt.done = done
    cs.attempt = newAttempt
    return nil
}

func (cc *ClientConn) getTransport(ctx context.Context, failfast bool, method string) (transport.ClientTransport, func(balancer.DoneInfo), error) {
    // the interesting part is the pick method
    t, done, err := cc.blockingpicker.pick(ctx, failfast, balancer.PickInfo{
        Ctx:            ctx,
        FullMethodName: method,
    })
}

func (pw *pickerWrapper) pick(ctx context.Context, failfast bool, info balancer.PickInfo) (transport.ClientTransport, func(balancer.DoneInfo), error) {
    for { // ... omitted: the rest of the selection loop shown earlier
        // pick a transport that satisfies the requirements
        if t, ok := acw.getAddrConn().getReadyTransport(); ok {
            if channelz.IsOn() {
                return t, doneChannelzWrapper(acw, pickResult.Done), nil
            }
            return t, pickResult.Done, nil
        }
    }
}

func (ac *addrConn) getReadyTransport() (transport.ClientTransport, bool) {
    // establish the connection
    ac.connect()
    return nil, false
}

func (ac *addrConn) connect() error {
    // connect asynchronously
    go ac.resetTransport()
    return nil
}

func (ac *addrConn) resetTransport() {
    for i := 0; ; i++ {
        // try to create a connection; return as soon as one succeeds
        newTr, addr, reconnect, err := ac.tryAllAddrs(addrs, connectDeadline)
    }
}

func (ac *addrConn) createTransport(addr resolver.Address, copts transport.ConnectOptions, connectDeadline time.Time) (transport.ClientTransport, *grpcsync.Event, error) {
    // created via NewClientTransport
    newTr, err := transport.NewClientTransport(connectCtx, ac.cc.ctx, addr, copts, onPrefaceReceipt, onGoAway, onClose)
}

func NewClientTransport(connectCtx, ctx context.Context, addr resolver.Address, opts ConnectOptions, onPrefaceReceipt func(), onGoAway func(GoAwayReason), onClose func()) (ClientTransport, error) {
    return newHTTP2Client(connectCtx, ctx, addr, opts, onPrefaceReceipt, onGoAway, onClose)
}

// The key function. There are too many details to cover; here we only care about how incoming messages are received
func newHTTP2Client(connectCtx, ctx context.Context, addr resolver.Address, opts ConnectOptions, onPrefaceReceipt func(), onGoAway func(GoAwayReason), onClose func()) (_ *http2Client, err error) {
    // Start the reader goroutine for incoming message. Each transport has
    // a dedicated goroutine which reads HTTP2 frame from network. Then it
    // dispatches the frame to the corresponding stream entity.
    go t.reader()
}

// Handles HTTP/2 frames, mirroring the server side.
func (t *http2Client) reader() {
    defer close(t.readerDone)
    // Check the validity of server preface.
    frame, err := t.framer.fr.ReadFrame()
    if err != nil {
        t.Close() // this kicks off resetTransport, so must be last before return
        return
    }
    t.conn.SetReadDeadline(time.Time{}) // reset deadline once we get the settings frame (we didn't time out, yay!)
    if t.keepaliveEnabled {
        atomic.StoreInt64(&t.lastRead, time.Now().UnixNano())
    }
    sf, ok := frame.(*http2.SettingsFrame)
    if !ok {
        t.Close() // this kicks off resetTransport, so must be last before return
        return
    }
    t.onPrefaceReceipt()
    t.handleSettings(sf, true)

    // loop to keep reading incoming messages on this transport.
    for {
        t.controlBuf.throttle()
        frame, err := t.framer.fr.ReadFrame()
        if t.keepaliveEnabled {
            atomic.StoreInt64(&t.lastRead, time.Now().UnixNano())
        }
        if err != nil {
            // Abort an active stream if the http2.Framer returns a
            // http2.StreamError. This can happen only if the server's response
            // is malformed http2.
            if se, ok := err.(http2.StreamError); ok {
                t.mu.Lock()
                s := t.activeStreams[se.StreamID]
                t.mu.Unlock()
                if s != nil {
                    // use error detail to provide better err message
                    code := http2ErrConvTab[se.Code]
                    errorDetail := t.framer.fr.ErrorDetail()
                    var msg string
                    if errorDetail != nil {
                        msg = errorDetail.Error()
                    } else {
                        msg = "received invalid frame"
                    }
                    t.closeStream(s, status.Error(code, msg), true, http2.ErrCodeProtocol, status.New(code, msg), nil, false)
                }
                continue
            } else {
                // Transport error.
                t.Close()
                return
            }
        }
        switch frame := frame.(type) {
        case *http2.MetaHeadersFrame:
            t.operateHeaders(frame)
        case *http2.DataFrame:
            t.handleData(frame)
        case *http2.RSTStreamFrame:
            t.handleRSTStream(frame)
        case *http2.SettingsFrame:
            t.handleSettings(frame, false)
        case *http2.PingFrame:
            t.handlePing(frame)
        case *http2.GoAwayFrame:
            t.handleGoAway(frame)
        case *http2.WindowUpdateFrame:
            t.handleWindowUpdate(frame)
        default:
            if logger.V(logLevel) {
                logger.Errorf("transport: http2Client.reader got unhandled frame type %v.", frame)
            }
        }
    }
}

To recap, the call flow is:

start ==> withRetry ==> retryLocked ==> newAttemptLocked ==> getTransport ==> pick ==> getReadyTransport ==> connect ==> resetTransport ==> createTransport ==> NewClientTransport ==> newHTTP2Client ==> reader ==> done

 

RecvMsg analysis

Receiving follows much the same flow; you can think of it as the send path in reverse.

func (cs *clientStream) RecvMsg(m interface{}) error {
    // ... omitted
    err := cs.withRetry(func(a *csAttempt) error {
        return a.recvMsg(m, recvInfo)
    }, cs.commitAttemptLocked)
}

The message is handled in recvMsg:

func (a *csAttempt) recvMsg(m interface{}, payInfo *payloadInfo) (err error) {
    cs := a.cs
    if a.statsHandler != nil && payInfo == nil {
        payInfo = &payloadInfo{}
    }

    if !a.decompSet {
        // Block until we receive headers containing received message encoding.
        if ct := a.s.RecvCompress(); ct != "" && ct != encoding.Identity {
            if a.dc == nil || a.dc.Type() != ct {
                // No configured decompressor, or it does not match the incoming
                // message encoding; attempt to find a registered compressor that does.
                a.dc = nil
                a.decomp = encoding.GetCompressor(ct)
            }
        } else {
            // No compression is used; disable our decompressor.
            a.dc = nil
        }
        // Only initialize this state once per stream.
        a.decompSet = true
    }

    // recv reads the data into payInfo; the still-undecoded bytes end up in the struct's []byte field
    err = recv(a.p, cs.codec, a.s, a.dc, m, *cs.callInfo.maxReceiveMessageSize, payInfo, a.decomp)
    if err != nil {
        if err == io.EOF {
            if statusErr := a.s.Status().Err(); statusErr != nil {
                return statusErr
            }
            return io.EOF // indicates successful end of stream.
        }
        return toRPCErr(err)
    }
    if a.trInfo != nil {
        a.mu.Lock()
        if a.trInfo.tr != nil {
            a.trInfo.tr.LazyLog(&payload{sent: false, msg: m}, true)
        }
        a.mu.Unlock()
    }

    // The stats handler is invoked with the payload read off the transport; HandleRPC is the hook that runs once data has been received
    if a.statsHandler != nil {
        a.statsHandler.HandleRPC(cs.ctx, &stats.InPayload{
            Client:   true,
            RecvTime: time.Now(),
            Payload:  m,
            // TODO truncate large payload.
            Data:       payInfo.uncompressedBytes,
            WireLength: payInfo.wireLength,
            Length:     len(payInfo.uncompressedBytes),
        })
    }
    if channelz.IsOn() {
        a.t.IncrMsgRecv()
    }
    // For server-streaming RPCs we return right away
    if cs.desc.ServerStreams {
        // Subsequent messages should be received by subsequent RecvMsg calls.
        return nil
    }

    // For non-server-streaming RPCs we still have to handle nil and EOF: EOF marks the successful end of this receive; any other error is propagated
    // Special handling for non-server-stream rpcs.
    // This recv expects EOF or errors, so we don't collect inPayload.
    err = recv(a.p, cs.codec, a.s, a.dc, m, *cs.callInfo.maxReceiveMessageSize, nil, a.decomp)
    if err == nil {
        return toRPCErr(errors.New("grpc: client streaming protocol violation: get <nil>, want <EOF>"))
    }
    if err == io.EOF {
        return a.s.Status().Err() // non-server streaming Recv returns nil on success
    }
    return toRPCErr(err)
}

The raw bytes are read from the transport handle like this:

func recv(p *parser, c baseCodec, s *transport.Stream, dc Decompressor, m interface{}, maxReceiveMessageSize int, payInfo *payloadInfo, compressor encoding.Compressor) error {
    // receive and decompress
    d, err := recvAndDecompress(p, s, dc, maxReceiveMessageSize, payInfo, compressor)
}

func recvAndDecompress(p *parser, s *transport.Stream, dc Decompressor, maxReceiveMessageSize int, payInfo *payloadInfo, compressor encoding.Compressor) ([]byte, error) {
    // receive
    pf, d, err := p.recvMsg(maxReceiveMessageSize)
}


// recvMsg reads a complete gRPC message from the stream and returns the message
// together with its payload format; the caller owns the returned buffer.
// Possible errors are:
// io.EOF when there are no more messages
// io.ErrUnexpectedEOF
// errors of type transport.ConnectionError
// or errors defined in the status package.
func (p *parser) recvMsg(maxReceiveMessageSize int) (pf payloadFormat, msg []byte, err error) {
    // read the message header
    if _, err := p.r.Read(p.header[:]); err != nil {
        return 0, nil, err
    }

    pf = payloadFormat(p.header[0])
    // length of the message body
    length := binary.BigEndian.Uint32(p.header[1:])

    if length == 0 {
        return pf, nil, nil
    }
    if int64(length) > int64(maxInt) {
        return 0, nil, status.Errorf(codes.ResourceExhausted, "grpc: received message larger than max length allowed on current machine (%d vs. %d)", length, maxInt)
    }
    if int(length) > maxReceiveMessageSize {
        return 0, nil, status.Errorf(codes.ResourceExhausted, "grpc: received message larger than max (%d vs. %d)", length, maxReceiveMessageSize)
    }
    // TODO(bradfitz,zhaoq): garbage. reuse buffer after proto decoding instead
    // of making it for each message:
    msg = make([]byte, int(length))
    if _, err := p.r.Read(msg); err != nil {
        if err == io.EOF {
            err = io.ErrUnexpectedEOF
        }
        return 0, nil, err
    }
    return pf, msg, nil
}
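
So on the wire every gRPC message is simply a 1-byte compressed flag, a 4-byte big-endian length, and then the payload, which is what msgHeader builds on the send side. A sketch of framing a payload by hand (frameMessage is a made-up helper, not a grpc-go function; it assumes encoding/binary is imported):

// frameMessage builds the 5-byte prefix that parser.recvMsg parses above:
// byte 0 is the payloadFormat (0 = plain, 1 = compressed), bytes 1-4 are the
// big-endian payload length, followed by the payload itself.
func frameMessage(payload []byte, compressed bool) []byte {
    hdr := make([]byte, 5, 5+len(payload))
    if compressed {
        hdr[0] = 1
    }
    binary.BigEndian.PutUint32(hdr[1:], uint32(len(payload)))
    return append(hdr, payload...)
}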

At this point we have a rough picture of what an RPC framework does when a request is made. gRPC's design leans heavily towards generality: keeping cross-language calls consistent and uncorrupted. That is why most of its effort sits in the transport layer (protobuf encoding and compression), in making connection picking correct under racy conditions while still delivering reasonable concurrency, and in exposing hooks into the RPC internals so developers can customize things like load balancing, encoding/decoding, and compression.

posted @ 2022-05-31 17:03  Blackbinbin