编译 gpushare-device-plugin

编译 gpushare-device-plugin

手动编译可执行文件(探索)

gpushare-device-plugin/Dockerfile 中明确写了编译的命令:

# 第一个镜像(编译)
FROM golang:1.10-stretch as build

WORKDIR /go/src/github.com/AliyunContainerService/gpushare-device-plugin
COPY . .

# 编译 gpushare-device-plugin-v2
RUN export CGO_LDFLAGS_ALLOW='-Wl,--unresolved-symbols=ignore-in-object-files' && \
    go build -ldflags="-s -w" -o /go/bin/gpushare-device-plugin-v2 cmd/nvidia/main.go

# 编译 kubectl-inspect-gpushare-v2
RUN go build -o /go/bin/kubectl-inspect-gpushare-v2 cmd/inspect/*.go

# 第二个镜像(制作)
FROM debian:stretch-slim

# https://www.cnblogs.com/oolo/p/11679733.html#%E4%BD%BF%E7%94%A8-nvidia-docker2
# Device plugin 镜像能够探测到所有的 GPU 设备
ENV NVIDIA_VISIBLE_DEVICES=all
# 只加载 NV Driver 中 nvidia-smi,MPS 等 cli 和 lib,其他的不映射进去
ENV NVIDIA_DRIVER_CAPABILITIES=utility

# 将编译的代码放入容器中
COPY --from=build /go/bin/gpushare-device-plugin-v2 /usr/bin/gpushare-device-plugin-v2

COPY --from=build /go/bin/kubectl-inspect-gpushare-v2 /usr/bin/kubectl-inspect-gpushare-v2

# Device plugin 的入口函数
CMD ["gpushare-device-plugin-v2","-logtostderr"]

编译 gpushare-device-plugin-v2

(nni) root@ubuntu:/home/lihao/codes/gpushare-device-plugin# export CGO_LDFLAGS_ALLOW='-Wl,--unresolved-symbols=ignore-in-object-files' && \
>     go build -ldflags="-s -w" -o /go/bin/gpushare-device-plugin-v2 cmd/nvidia/main.go
cmd/nvidia/main.go:6:2: cannot find package "github.com/AliyunContainerService/gpushare-device-plugin/pkg/gpu/nvidia" in any of:
	/usr/local/go/src/github.com/AliyunContainerService/gpushare-device-plugin/pkg/gpu/nvidia (from $GOROOT)
	/opt/ml/bin/work/src/github.com/AliyunContainerService/gpushare-device-plugin/pkg/gpu/nvidia (from $GOPATH)
	/home/lihao/codes/jarvis-http-server/src/github.com/AliyunContainerService/gpushare-device-plugin/pkg/gpu/nvidia

由此可知,我们需要将代码放置到有 github.com/AliyunContainerService 结构的目录下才能编译,于是:

# 创建目录结构
(nni) root@ubuntu:/home/lihao/codes# mkdir -p /opt/ml/bin/work/src/github.com/AliyunContainerService/

# 建立软连接
(nni) root@ubuntu:/home/lihao/codes# ln -s /home/lihao/codes/gpushare-device-plugin /opt/ml/bin/work/src/github.com/AliyunContainerService/

# 再次编译
(nni) root@ubuntu:/home/lihao/codes/gpushare-device-plugin# export CGO_LDFLAGS_ALLOW='-Wl,--unresolved-symbols=ignore-in-object-files' &&     go build -ldflags="-s -w" -o /go/bin/gpushare-device-plugin-v2 cmd/nvidia/main.go
(nni) root@ubuntu:/home/lihao/codes/gpushare-device-plugin# ll /go/bin/gpushare-device-plugin-v2
-rwxr-xr-x 1 root root 28353928 10月 16 10:09 /go/bin/gpushare-device-plugin-v2*

编译 kubectl-inspect-gpushare-v2

(nni) root@ubuntu:/home/lihao/codes/gpushare-device-plugin# go build -o /go/bin/kubectl-inspect-gpushare-v2 cmd/inspect/*.go
cmd/inspect/display.go:11:2: cannot find package "k8s.io/api/core/v1" in any of:
	/usr/local/go/src/k8s.io/api/core/v1 (from $GOROOT)
	/opt/ml/bin/work/src/k8s.io/api/core/v1 (from $GOPATH)
	/home/lihao/codes/jarvis-http-server/src/k8s.io/api/core/v1
cmd/inspect/nodeinfo.go:8:2: cannot find package "k8s.io/apimachinery/pkg/apis/meta/v1" in any of:
	/usr/local/go/src/k8s.io/apimachinery/pkg/apis/meta/v1 (from $GOROOT)
	/opt/ml/bin/work/src/k8s.io/apimachinery/pkg/apis/meta/v1 (from $GOPATH)
	/home/lihao/codes/jarvis-http-server/src/k8s.io/apimachinery/pkg/apis/meta/v1
cmd/inspect/podinfo.go:13:2: cannot find package "k8s.io/apimachinery/pkg/fields" in any of:
	/usr/local/go/src/k8s.io/apimachinery/pkg/fields (from $GOROOT)
	/opt/ml/bin/work/src/k8s.io/apimachinery/pkg/fields (from $GOPATH)
	/home/lihao/codes/jarvis-http-server/src/k8s.io/apimachinery/pkg/fields
cmd/inspect/podinfo.go:14:2: cannot find package "k8s.io/apimachinery/pkg/labels" in any of:
	/usr/local/go/src/k8s.io/apimachinery/pkg/labels (from $GOROOT)
	/opt/ml/bin/work/src/k8s.io/apimachinery/pkg/labels (from $GOPATH)
	/home/lihao/codes/jarvis-http-server/src/k8s.io/apimachinery/pkg/labels
cmd/inspect/podinfo.go:15:2: cannot find package "k8s.io/client-go/kubernetes" in any of:
	/usr/local/go/src/k8s.io/client-go/kubernetes (from $GOROOT)
	/opt/ml/bin/work/src/k8s.io/client-go/kubernetes (from $GOPATH)
	/home/lihao/codes/jarvis-http-server/src/k8s.io/client-go/kubernetes
cmd/inspect/podinfo.go:16:2: cannot find package "k8s.io/client-go/rest" in any of:
	/usr/local/go/src/k8s.io/client-go/rest (from $GOROOT)
	/opt/ml/bin/work/src/k8s.io/client-go/rest (from $GOPATH)
	/home/lihao/codes/jarvis-http-server/src/k8s.io/client-go/rest
cmd/inspect/podinfo.go:17:2: cannot find package "k8s.io/client-go/tools/clientcmd" in any of:
	/usr/local/go/src/k8s.io/client-go/tools/clientcmd (from $GOROOT)
	/opt/ml/bin/work/src/k8s.io/client-go/tools/clientcmd (from $GOPATH)
	/home/lihao/codes/jarvis-http-server/src/k8s.io/client-go/tools/clientcmd

kubectl-inspect-gpushare-v2 依赖的包并不在 GOPATH 中,因此报错,我们可以用 go get XXX 来解决依赖

(nni) root@ubuntu:/home/lihao/codes/gpushare-device-plugin# go get k8s.io/api/core/v1
package k8s.io/api/core/v1: unrecognized import path "k8s.io/api/core/v1" (https fetch: Get https://k8s.io/api/core/v1?go-get=1: dial tcp 35.201.71.162:443: connect: connection refused)

k8s.io 被墙了,无法连接,建议用户搭建 http/https 代理,我这里就直接使用了:

(nni) root@ubuntu:/home/lihao/codes/gpushare-device-plugin# export http_proxy=http://lihao:lihao-root@10.10.67.135:3128/
(nni) root@ubuntu:/home/lihao/codes/gpushare-device-plugin# export https_proxy=http://lihao:lihao-root@10.10.67.135:3128/

(nni) root@ubuntu:/home/lihao/codes/gpushare-device-plugin# go get k8s.io/api/core/v1
(nni) root@ubuntu:/home/lihao/codes/gpushare-device-plugin# go get k8s.io/client-go/kubernetes
# k8s.io/client-go/transport
/opt/ml/bin/work/src/k8s.io/client-go/transport/round_trippers.go:70:11: cannot convert klog.V(9) (type klog.Verbose) to type bool
/opt/ml/bin/work/src/k8s.io/client-go/transport/round_trippers.go:72:11: cannot convert klog.V(8) (type klog.Verbose) to type bool
/opt/ml/bin/work/src/k8s.io/client-go/transport/round_trippers.go:74:11: cannot convert klog.V(7) (type klog.Verbose) to type bool
/opt/ml/bin/work/src/k8s.io/client-go/transport/round_trippers.go:76:11: cannot convert klog.V(6) (type klog.Verbose) to type bool
(nni) root@ubuntu:/home/lihao/codes/gpushare-device-plugin# go get k8s.io/client-go/rest
# k8s.io/client-go/transport
/opt/ml/bin/work/src/k8s.io/client-go/transport/round_trippers.go:70:11: cannot convert klog.V(9) (type klog.Verbose) to type bool
/opt/ml/bin/work/src/k8s.io/client-go/transport/round_trippers.go:72:11: cannot convert klog.V(8) (type klog.Verbose) to type bool
/opt/ml/bin/work/src/k8s.io/client-go/transport/round_trippers.go:74:11: cannot convert klog.V(7) (type klog.Verbose) to type bool
/opt/ml/bin/work/src/k8s.io/client-go/transport/round_trippers.go:76:11: cannot convert klog.V(6) (type klog.Verbose) to type bool
(nni) root@ubuntu:/home/lihao/codes/gpushare-device-plugin# go get k8s.io/client-go/tools/clientcmd
# k8s.io/client-go/transport
/opt/ml/bin/work/src/k8s.io/client-go/transport/round_trippers.go:70:11: cannot convert klog.V(9) (type klog.Verbose) to type bool
/opt/ml/bin/work/src/k8s.io/client-go/transport/round_trippers.go:72:11: cannot convert klog.V(8) (type klog.Verbose) to type bool
/opt/ml/bin/work/src/k8s.io/client-go/transport/round_trippers.go:74:11: cannot convert klog.V(7) (type klog.Verbose) to type bool
/opt/ml/bin/work/src/k8s.io/client-go/transport/round_trippers.go:76:11: cannot convert klog.V(6) (type klog.Verbose) to type bool

遇到了 cannot convert klog.V(9) (type klog.Verbose) to type bool 的错误,只能 google 之,得到的结论是他们依赖的 klog 应该是 0.4.0 版本,而不是最新的 1.0.0,于是进入 klog 的路径,进行如下操作:

# 使用 klog 0.4.0 或许有更好的办法,比如 godep,gomodule 暂时先不考虑
(nni) root@ubuntu:/home/lihao/codes/gpushare-device-plugin# cd /opt/ml/bin/work/src/k8s.io/klog
(nni) root@ubuntu:/opt/ml/bin/work/src/k8s.io/klog# git checkout -b v0.4.0 v0.4.0
Switched to a new branch 'v0.4.0'

# 重新下载依赖
(nni) root@ubuntu:/# go get k8s.io/client-go/rest
(nni) root@ubuntu:/# go get k8s.io/client-go/tools/clientcmd
(nni) root@ubuntu:/# go get k8s.io/client-go/kubernetes
(nni) root@ubuntu:/# go get k8s.io/api/core/v1

# 编译
(nni) root@ubuntu:/home/lihao/codes/gpushare-device-plugin# go build -o /go/bin/kubectl-inspect-gpushare-v2 cmd/inspect/*.go
(nni) root@ubuntu:/home/lihao/codes/gpushare-device-plugin# /go/bin/kubectl-inspect-gpushare-v2 --help
Usage of /go/bin/kubectl-inspect-gpushare-v2:
  -alsologtostderr
    	log to standard error as well as files
  -d	details
  -log_backtrace_at value
    	when logging hits line file:N, emit a stack trace
  -log_dir string
    	If non-empty, write log files in this directory
  -logtostderr
    	log to standard error instead of files
  -stderrthreshold value
    	logs at or above this threshold go to stderr
  -v value
    	log level for V logs
  -vmodule value
    	comma-separated list of pattern=N settings for file-filtered logging

使用 vendor 来编译(推荐)

技术在日新月异的发展,上述的编译过程实在是漏洞百出,最核心的一点是,go get XXX 下载的是最新的包,该项目开发的时候用的依赖包版本和我们编译时候的包的版本极大可能会不一致,导致不可知的风险,因此上述的编译方法简直是个灾难。

当我发现 gpushare-device-plugin/vendor 下有该项目完整的依赖时,我就知道,我必须利用 vendor 的依赖来编译,这样才能保证是没有问题的。

为了保持环境的整洁,我重构编译环境,找一台新的没有装过 go 的机器:

[root@localhost ~]# wget https://mirrors.nju.edu.cn/golang/go1.12.1.linux-amd64.tar.gz -P /root/lihao04/
[root@localhost ~]# cd /root/lihao04
[root@localhost lihao04]# tar -zxvf go1.12.1.linux-amd64.tar.gz
[root@localhost lihao04]# export PATH=$PATH:/root/lihao04/go/bin/
[root@localhost lihao04]# go version
go version go1.12.1 linux/amd64

[root@localhost lihao04]# git clone https://github.com/AliyunContainerService/gpushare-device-plugin.git
Cloning into 'gpushare-device-plugin'...
remote: Enumerating objects: 2227, done.
remote: Total 2227 (delta 0), reused 0 (delta 0), pack-reused 2227
Receiving objects: 100% (2227/2227), 4.59 MiB | 447.00 KiB/s, done.
Resolving deltas: 100% (765/765), done.

[root@localhost lihao04]# cd gpushare-device-plugin
[root@localhost gpushare-device-plugin]# go build -o /go/bin/kubectl-inspect-gpushare-v2 cmd/inspect/*.go
cmd/inspect/display.go:10:2: cannot find package "github.com/golang/glog" in any of:
	/root/lihao04/go/src/github.com/golang/glog (from $GOROOT)
	/root/go/src/github.com/golang/glog (from $GOPATH)
cmd/inspect/display.go:11:2: cannot find package "k8s.io/api/core/v1" in any of:
	/root/lihao04/go/src/k8s.io/api/core/v1 (from $GOROOT)
	/root/go/src/k8s.io/api/core/v1 (from $GOPATH)
cmd/inspect/nodeinfo.go:8:2: cannot find package "k8s.io/apimachinery/pkg/apis/meta/v1" in any of:
	/root/lihao04/go/src/k8s.io/apimachinery/pkg/apis/meta/v1 (from $GOROOT)
	/root/go/src/k8s.io/apimachinery/pkg/apis/meta/v1 (from $GOPATH)
cmd/inspect/podinfo.go:13:2: cannot find package "k8s.io/apimachinery/pkg/fields" in any of:
	/root/lihao04/go/src/k8s.io/apimachinery/pkg/fields (from $GOROOT)
	/root/go/src/k8s.io/apimachinery/pkg/fields (from $GOPATH)
cmd/inspect/podinfo.go:14:2: cannot find package "k8s.io/apimachinery/pkg/labels" in any of:
	/root/lihao04/go/src/k8s.io/apimachinery/pkg/labels (from $GOROOT)
	/root/go/src/k8s.io/apimachinery/pkg/labels (from $GOPATH)
cmd/inspect/podinfo.go:15:2: cannot find package "k8s.io/client-go/kubernetes" in any of:
	/root/lihao04/go/src/k8s.io/client-go/kubernetes (from $GOROOT)
	/root/go/src/k8s.io/client-go/kubernetes (from $GOPATH)
cmd/inspect/podinfo.go:16:2: cannot find package "k8s.io/client-go/rest" in any of:
	/root/lihao04/go/src/k8s.io/client-go/rest (from $GOROOT)
	/root/go/src/k8s.io/client-go/rest (from $GOPATH)
cmd/inspect/podinfo.go:17:2: cannot find package "k8s.io/client-go/tools/clientcmd" in any of:
	/root/lihao04/go/src/k8s.io/client-go/tools/clientcmd (from $GOROOT)
	/root/go/src/k8s.io/client-go/tools/clientcmd (from $GOPATH)

很明显,从 GOPATH 中找不到依赖,搜索了下,貌似可以通过 export GO111MODULE=on 启用 go modules,这样就不再需要强依赖 GOPATH

[root@localhost gpushare-device-plugin]# export GO111MODULE=on
[root@localhost gpushare-device-plugin]# go build -o /go/bin/kubectl-inspect-gpushare-v2 cmd/inspect/*.go
go: creating new go.mod: module github.com/AliyunContainerService/gpushare-device-plugin
go: copying requirements from Gopkg.lock
go: converting Gopkg.lock: stat google.golang.org/genproto@42f80515abfed431577ebeded4ab86390ce0a5cd: unrecognized import path "google.golang.org/genproto" (https fetch: Get https://google.golang.org/genproto?go-get=1: dial tcp 216.239.37.1:443: connect: connection timed out)
go: converting Gopkg.lock: stat k8s.io/apiserver@13cfe3978170675900fbed4994382716c5bd293b: unrecognized import path "k8s.io/apiserver" (https fetch: Get https://k8s.io/apiserver?go-get=1: dial tcp 35.201.71.162:443: connect: connection timed out)
go: converting Gopkg.lock: stat k8s.io/client-go@v8.0.0: unrecognized import path "k8s.io/client-go" (https fetch: Get https://k8s.io/client-go?go-get=1: dial tcp 35.201.71.162:443: connect: connection timed out)
go: converting Gopkg.lock: stat k8s.io/kube-openapi@f442ecb314a3679150c272e2b9713d8deed5955d: unrecognized import path "k8s.io/kube-openapi" (https fetch: Get https://k8s.io/kube-openapi?go-get=1: dial tcp 35.201.71.162:443: connect: connection timed out)
go: converting Gopkg.lock: stat k8s.io/kubernetes@v1.12.3: unrecognized import path "k8s.io/kubernetes" (https fetch: Get https://k8s.io/kubernetes?go-get=1: dial tcp 35.201.71.162:443: connect: connection timed out)
^C

启用了 export GO111MODULE=on 确实有变化,编译中缺少的包会直接去尝试下载,但是这不是我们期待的,可以通过增加 -mod=vendor 来明确使用 vendor 下的依赖:

# 取消 GO111MODULE 设置
[root@localhost gpushare-device-plugin]# unset GO111MODULE

# 编译失败,错误提示告诉我们,使用 -mod=vendor 参数必须启用 GO111MODULE!
[root@localhost gpushare-device-plugin]# go build -mod=vendor -o /go/bin/kubectl-inspect-gpushare-v2 cmd/inspect/*.go
build flag -mod=vendor only valid when using modules

# 重新打开 GO111MODULE=on
[root@localhost gpushare-device-plugin]# export GO111MODULE=on

[root@localhost gpushare-device-plugin]# go build -mod=vendor -o /go/bin/kubectl-inspect-gpushare-v2 cmd/inspect/*.go
go: creating new go.mod: module github.com/AliyunContainerService/gpushare-device-plugin
go: copying requirements from Gopkg.lock
go: converting Gopkg.lock: stat github.com/gogo/protobuf@v1.1.1: repo version lookup disabled by -mod=vendor
go: converting Gopkg.lock: stat github.com/json-iterator/go@v1.1.5: repo version lookup disabled by -mod=vendor
go: converting Gopkg.lock: stat github.com/NVIDIA/nvidia-docker@v1.0.1: repo version lookup disabled by -mod=vendor
go: converting Gopkg.lock: stat github.com/golang/glog@23def4e6c14b4da8ac2ed8007337bc5eb5007998: repo version lookup disabled by -mod=vendor
go: converting Gopkg.lock: stat google.golang.org/genproto@42f80515abfed431577ebeded4ab86390ce0a5cd: repo version lookup disabled by -mod=vendor
go: converting Gopkg.lock: stat github.com/hashicorp/golang-lru@v0.5.0: repo version lookup disabled by -mod=vendor
go: converting Gopkg.lock: stat github.com/davecgh/go-spew@v1.1.1: repo version lookup disabled by -mod=vendor
go: converting Gopkg.lock: stat github.com/fsnotify/fsnotify@v1.4.7: repo version lookup disabled by -mod=vendor
go: converting Gopkg.lock: stat k8s.io/apiserver@13cfe3978170675900fbed4994382716c5bd293b: repo version lookup disabled by -mod=vendor
go: converting Gopkg.lock: stat k8s.io/api@2d6f90ab1293a1fb871cf149423ebb72aa7423aa: repo version lookup disabled by -mod=vendor
go: converting Gopkg.lock: stat golang.org/x/time@f51c12702a4d776e4c1fa9b0fabab841babae631: repo version lookup disabled by -mod=vendor
go: converting Gopkg.lock: stat golang.org/x/crypto@56440b844dfe139a8ac053f4ecac0b20b79058f4: repo version lookup disabled by -mod=vendor
go: converting Gopkg.lock: stat google.golang.org/grpc@v1.16.0: repo version lookup disabled by -mod=vendor
go: converting Gopkg.lock: stat github.com/modern-go/concurrent@bacd9c7ef1dd9b15be4a9909b8ac7a4e313eec94: repo version lookup disabled by -mod=vendor
go: converting Gopkg.lock: stat k8s.io/apiextensions-apiserver@f584b16eb23bd2a3fd292a027d698d95db427c5d: repo version lookup disabled by -mod=vendor
go: converting Gopkg.lock: stat github.com/peterbourgon/diskv@v2.0.1: repo version lookup disabled by -mod=vendor
go: converting Gopkg.lock: stat golang.org/x/net@0ed95abb35c445290478a5348a7b38bb154135fd: repo version lookup disabled by -mod=vendor
go: converting Gopkg.lock: stat k8s.io/client-go@v8.0.0: repo version lookup disabled by -mod=vendor
go: converting Gopkg.lock: stat github.com/NVIDIA/gpu-monitoring-tools@86f2a9fac6c5b597dc494420005144b8ef7ec9fb: repo version lookup disabled by -mod=vendor
go: converting Gopkg.lock: stat github.com/golang/protobuf@v1.2.0: repo version lookup disabled by -mod=vendor
go: converting Gopkg.lock: stat k8s.io/kube-openapi@f442ecb314a3679150c272e2b9713d8deed5955d: repo version lookup disabled by -mod=vendor
go: converting Gopkg.lock: stat github.com/gregjones/httpcache@787624de3eb7bd915c329cba748687a3b22666a6: repo version lookup disabled by -mod=vendor
go: converting Gopkg.lock: stat golang.org/x/text@v0.3.0: repo version lookup disabled by -mod=vendor
go: converting Gopkg.lock: stat gopkg.in/inf.v0@v0.9.1: repo version lookup disabled by -mod=vendor
go: converting Gopkg.lock: stat github.com/google/gofuzz@24818f796faf91cd76ec7bddd72458fbced7a6c1: repo version lookup disabled by -mod=vendor
go: converting Gopkg.lock: stat k8s.io/kubernetes@v1.12.3: repo version lookup disabled by -mod=vendor
go: converting Gopkg.lock: stat github.com/googleapis/gnostic@v0.2.0: repo version lookup disabled by -mod=vendor
go: converting Gopkg.lock: stat github.com/ghodss/yaml@v1.0.0: repo version lookup disabled by -mod=vendor
go: converting Gopkg.lock: stat github.com/google/btree@4030bb1f1f0c35b30ca7009e9ebd06849dd45306: repo version lookup disabled by -mod=vendor
go: converting Gopkg.lock: stat github.com/petar/GoLLRB@53be0d36a84c2a886ca057d34b6aa4468df9ccb4: repo version lookup disabled by -mod=vendor
go: converting Gopkg.lock: stat github.com/modern-go/reflect2@4b7aa43c6742a2c18fdef89dd197aaae7dac7ccd: repo version lookup disabled by -mod=vendor
go: converting Gopkg.lock: stat k8s.io/apimachinery@70adfbae261eebb795b76321790745ad0e3c523f: repo version lookup disabled by -mod=vendor
go: converting Gopkg.lock: stat gopkg.in/yaml.v2@v2.2.2: repo version lookup disabled by -mod=vendor
go: converting Gopkg.lock: stat golang.org/x/sys@98c5dad5d1a0e8a73845ecc8897d0bd56586511d: repo version lookup disabled by -mod=vendor
go: converting Gopkg.lock: stat github.com/imdario/mergo@v0.3.6: repo version lookup disabled by -mod=vendor
go: converting Gopkg.lock: stat github.com/spf13/pflag@v1.0.3: repo version lookup disabled by -mod=vendor

# kubectl-inspect-gpushare-v2 存在
[root@localhost gpushare-device-plugin]# ll /go/bin/kubectl-inspect-gpushare-v2
-rwxr-xr-x 1 root root 32535344 Oct 16 14:35 /go/bin/kubectl-inspect-gpushare-v2

# 编译 gpushare-device-plugin 也可以成功,注意我们额外增加了 -mod=vendor
[root@localhost gpushare-device-plugin]# export CGO_LDFLAGS_ALLOW='-Wl,--unresolved-symbols=ignore-in-object-files' &&     go build -ldflags="-s -w" -mod=vendor  -o /go/bin/gpushare-device-plugin-v2 cmd/nvidia/main.go

# gpushare-device-plugin 存在
[root@localhost gpushare-device-plugin]# ll /go/bin/gpushare-device-plugin-v2
-rwxr-xr-x 1 root root 26825912 Oct 16 14:38 /go/bin/gpushare-device-plugin-v2

通过使用 gpushare-device-plugin/vendor 我们指定 -mod=vendor 参数,根本不用我们自己处理依赖,项目的开发人员早就帮我们弄好了,不得不说,太方便了!而且保证一定是正确的!

但是这引发了我的思考:

  1. vendor 目录如何生成?
  2. 如果修改源码,新增依赖项目,应该怎么办?

vendor 目录如何生成?

vendor 是 Go 从 v1.5 开始引入的依赖管理设计,只是用来存储本地依赖。我暂时的理解是,开发者把所有的依赖放到项目的 vendor 文件夹下,再设置环境变量 enable 该 feature,编译时依赖就走这个 vendor 文件夹,这样可以避免多个项目为了使用同一依赖的不同版本而必须各自设置 GOPATH 的问题。

但是如何把所有依赖都挪到 vendor 目录中,GO 语言有一些方案,同时也有一些第三方的方案。

我们查看 gpushare-device-plugin 项目,发现有 Gopkg.lock 和 Gopkg.toml 这两个文件:

[root@localhost gpushare-device-plugin]# cat Gopkg.toml

[[constraint]]
  revision = "86f2a9fac6c5b597dc494420005144b8ef7ec9fb"
  name = "github.com/NVIDIA/gpu-monitoring-tools"

[[constraint]]
  name = "k8s.io/kubernetes"
  version = "v1.11.2"

[[constraint]]
  name = "k8s.io/apimachinery"
  branch = "release-1.11"

[[constraint]]
  name = "k8s.io/client-go"
  version = "~v8.0.0"

[[override]]
  name = "k8s.io/api"
  version = "kubernetes-1.11.2"

[[override]]
  name = "github.com/gregjones/httpcache"
  revision = "787624de3eb7bd915c329cba748687a3b22666a6"

[[override]]
  name = "golang.org/x/time"
  revision = "f51c12702a4d776e4c1fa9b0fabab841babae631"

[[override]]
  name = "github.com/docker/docker"
  revision = "4f3616fb1c112e206b88cb7a9922bf49067a7756"

[[override]]
  name = "github.com/docker/distribution"
  revision = "edc3ab29cdff8694dd6feb85cfeb4b5f1b38ed9c"

[prune]
  go-tests = true
  unused-packages = true

我们仔细观察,这里面记录的内容不就是依赖的版本信息么?这个文件一定有特别之处;搜索之,果然有收获。

原来 Gopkg.lock 和 Gopkg.toml 这两个文件是 GO Dep 项目的两个主要配置文件,简单讲 Gopkg.toml 是清单文件,Gopkg.lock 是校验描述文件。尽量不要手动修改,避免造成两个文件不同步的错误。

GO Dep 项目正是官方对依赖高效、简洁地管理提出的解决工具。也就是说,使用 Dep 应该可以生成 vendor 目录?

首先我们需要安装 Dep 工具:

# 安装 go dep 工具
[root@localhost gpushare-device-plugin]# curl https://raw.githubusercontent.com/golang/dep/master/install.sh | sh
  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100  5230  100  5230    0     0   9254      0 --:--:-- --:--:-- --:--:--  9289
ARCH = amd64
OS = linux
Will install into /root/go/bin
Fetching https://github.com/golang/dep/releases/latest..
Release Tag = v0.5.4
Fetching https://github.com/golang/dep/releases/tag/v0.5.4..
Fetching https://github.com/golang/dep/releases/download/v0.5.4/dep-linux-amd64..
Setting executable permissions.
Moving executable to /root/go/bin/dep

# 将 /root/go/bin/ 加入 PATH
[root@localhost gpushare-device-plugin]# export PATH=$PATH:/root/go/bin/

# 测试,安装成功
[root@localhost gpushare-device-plugin]# dep
Dep is a tool for managing dependencies for Go projects

Usage: "dep [command]"

Commands:

  init     Set up a new Go project, or migrate an existing one
  status   Report the status of the project's dependencies
  ensure   Ensure a dependency is safely vendored in the project
  version  Show the dep version information
  check    Check if imports, Gopkg.toml, and Gopkg.lock are in sync

Examples:
  dep init                               set up a new project
  dep ensure                             install the project's dependencies
  dep ensure -update                     update the locked versions of all dependencies
  dep ensure -add github.com/pkg/errors  add a dependency to the project

Use "dep help [command]" for more information about a command.

安装好 Dep 后,为了更好的测试,我们将 Gopkg.lock、Gopkg.toml 和 vendor 先移走,然后执行 dep init:

# 移除已经做好的依赖
[root@localhost gpushare-device-plugin]# mv Gopkg.lock Gopkg.toml /home/
[root@localhost gpushare-device-plugin]# mv vendor /home/

# 初始化
[root@localhost gpushare-device-plugin]# dep init
init failed: unable to detect the containing GOPATH: /root/lihao04/gpushare-device-plugin is not within a known GOPATH/src

报错的意思是 dep init 所在的文件夹必须在 GOPATH/src 下,因此需要做调整:

[root@localhost gpushare-device-plugin]# mkdir -p /opt/ml/bin/work/src/github.com/AliyunContainerService/
[root@localhost gpushare-device-plugin]# export GOPATH=/opt/ml/bin/work
[root@localhost gpushare-device-plugin]# ln -s /root/lihao04/gpushare-device-plugin /opt/ml/bin/work/src/github.com/AliyunContainerService/
[root@localhost gpushare-device-plugin]# cd /opt/ml/bin/work/src/github.com/AliyunContainerService/gpushare-device-plugin
[root@localhost gpushare-device-plugin]# dep init
init failed: unable to determine direct dependencies: unable to deduce repository and source type for "golang.org/x/net/context": unable to read metadata: unable to fetch raw metadata: failed HTTP request to URL "http://golang.org/x/net/context?go-get=1": Get http://golang.org/x/net/context?go-get=1: dial tcp 216.239.37.1:80: connect: connection timed out

报错的意思是被墙了,设置代理,并重新执行:

[root@localhost gpushare-device-plugin]# export http_proxy=http://jarvis:jarvis-root@10.191.67.135:3128/
[root@localhost gpushare-device-plugin]# export https_proxy=http://jarvis:jarvis-root@10.191.67.135:3128/

[root@localhost gpushare-device-plugin]# dep init
Importing configuration from godep. These are only initial constraints, and are further refined during the solve process.
Importing configuration from godep. These are only initial constraints, and are further refined during the solve process.
Importing configuration from godep. These are only initial constraints, and are further refined during the solve process.
Importing configuration from godep. These are only initial constraints, and are further refined during the solve process.
  Using master as constraint for direct dep github.com/NVIDIA/gpu-monitoring-tools
  Locking in master (7a750c7) for direct dep github.com/NVIDIA/gpu-monitoring-tools
  Locking in  (0c5108395e2d) for transitive dep github.com/googleapis/gnostic
  Locking in master (548a555) for transitive dep google.golang.org/genproto
  Locking in v0.3.1 (89e63fd) for transitive dep k8s.io/klog
  Locking in  (24818f796faf) for transitive dep github.com/google/gofuzz
  Locking in  (c2654d5206da) for transitive dep k8s.io/utils
  Locking in v1.0.1 (583c0c0) for transitive dep github.com/spf13/pflag
  Locking in v1.1.1 (8991bc2) for transitive dep github.com/davecgh/go-spew
  Locking in v1.3.2 (6c65a55) for transitive dep github.com/golang/protobuf
  Locking in v1.1.0 (fd68e98) for transitive dep sigs.k8s.io/yaml
  Locking in  (ab8a2e0c74be) for transitive dep github.com/json-iterator/go
  Locking in  (b3a7cee44a30) for transitive dep k8s.io/kube-openapi
  Using master as constraint for direct dep github.com/golang/glog
  Locking in master (23def4e) for direct dep github.com/golang/glog
  Locking in  (9f3314589c9a) for transitive dep golang.org/x/oauth2
  Locking in  (1799e75a0719) for direct dep k8s.io/apimachinery
  Using master as constraint for direct dep golang.org/x/net
  Locking in master (da9a3fd) for direct dep golang.org/x/net
  Using ^1.4.7 as constraint for direct dep github.com/fsnotify/fsnotify
  Locking in v1.4.7 (c282820) for direct dep github.com/fsnotify/fsnotify
  Locking in  (f51c12702a4d) for transitive dep golang.org/x/time
  Locking in v2.2.4 (f221b84) for transitive dep gopkg.in/yaml.v2
  Locking in v0.3.2 (342b2e1) for transitive dep golang.org/x/text
  Locking in master (b09406a) for transitive dep golang.org/x/sys
  Locking in  (e84da0312774) for transitive dep golang.org/x/crypto
  Locking in  (7cf5895f2711) for direct dep k8s.io/api
  Locking in 1.0.3 (bacd9c7) for transitive dep github.com/modern-go/concurrent
  Using ^12.0.0 as constraint for direct dep k8s.io/client-go
  Locking in v12.0.0 (78d2af7) for direct dep k8s.io/client-go
  Locking in v0.9.0 (3887ee9) for transitive dep gopkg.in/inf.v0
  Locking in v1.3.1 (5628607) for transitive dep github.com/gogo/protobuf
  Using ^1.24.0 as constraint for direct dep google.golang.org/grpc
  Locking in v1.24.0 (f6d0f9e) for direct dep google.golang.org/grpc
  Using ^1.16.2 as constraint for direct dep k8s.io/kubernetes
  Locking in v1.16.2 (c97fe50) for direct dep k8s.io/kubernetes
  Locking in v1.6.5 (971852b) for transitive dep google.golang.org/appengine
  Locking in v0.3.5 (9316a62) for transitive dep github.com/imdario/mergo
  Locking in v1.0.1 (94122c3) for transitive dep github.com/modern-go/reflect2

看起来执行成功了,我们来看下有什么变化:

先挑选一个貌似已经安装的依赖包 go-spew,看看它被保存在什么位置

[root@localhost gpushare-device-plugin]# find / -type d -name go-spew
# vendor 目录中有了!!!
/root/lihao04/gpushare-device-plugin/vendor/github.com/davecgh/go-spew
# 之前我们备份的,请忽略
/home/vendor/github.com/davecgh/go-spew
# GOPATH 中也保存了一份,查看了下确实不是硬链接,是真实的拷贝
/opt/ml/bin/work/pkg/dep/sources/https---github.com-kubernetes-kubernetes/vendor/github.com/davecgh/go-spew

看下 gpushare-device-plugin 目录下是否多出什么内容

# 发现 gpushare-device-plugin 下多出了 Gopkg.lock 和 Gopkg.toml !!!
[root@localhost gpushare-device-plugin]# ll /root/lihao04/gpushare-device-plugin/
total 52
drwxr-xr-x 4 root root    45 Oct 16 14:27 cmd
drwxr-xr-x 3 root root    30 Oct 16 14:27 demo
-rw-r--r-- 1 root root  1522 Oct 16 14:27 device-plugin-ds.yaml
-rw-r--r-- 1 root root   883 Oct 16 14:27 device-plugin-rbac.yaml
-rw-r--r-- 1 root root   684 Oct 16 14:27 Dockerfile
-rw------- 1 root root    73 Oct 16 15:11 go.mod
-rw-r--r-- 1 root root 13719 Oct 16 15:20 Gopkg.lock
-rw-r--r-- 1 root root  1153 Oct 16 15:20 Gopkg.toml
-rw-r--r-- 1 root root 11357 Oct 16 14:27 LICENSE
drwxr-xr-x 3 root root    24 Oct 16 14:27 pkg
-rw-r--r-- 1 root root   964 Oct 16 14:27 README.md
drwxr-xr-x 8 root root   138 Oct 16 15:20 vendor

至此,真相大白。我们通过实验,得出结论 Gopkg.lock、Gopkg.toml 和 vendor 都是由 dep init 自动生成的。

我们还可以对比下新旧 Gopkg.toml

[root@localhost gpushare-device-plugin]# diff /opt/ml/bin/work/src/github.com/AliyunContainerService/gpushare-device-plugin /home/Gopkg.toml
3c3
< # Refer to https://golang.github.io/dep/docs/Gopkg.toml.html
---
> # Refer to https://github.com/golang/dep/blob/master/docs/Gopkg.toml.md
27d26
<
29c28
<   branch = "master"
---
>   revision = "86f2a9fac6c5b597dc494420005144b8ef7ec9fb"
33,42c32,33
<   name = "github.com/fsnotify/fsnotify"
<   version = "1.4.7"
<
< [[constraint]]
<   branch = "master"
<   name = "github.com/golang/glog"
<
< [[constraint]]
<   branch = "master"
<   name = "golang.org/x/net"
---
>   name = "k8s.io/kubernetes"
>   version = "v1.11.2"
45,46c36,37
<   name = "google.golang.org/grpc"
<   version = "1.24.0"
---
>   name = "k8s.io/apimachinery"
>   branch = "release-1.11"
50c41
<   version = "12.0.0"
---
>   version = "~v8.0.0"
52,54c43,61

...
...

从对比的结果上看,确实依赖库的版本有出入。项目源码中只是通过 import XXX 导入依赖,并没有标明版本,因此 dep init 默认就使用适配的最新代码,有出入是正常的。

这里有一个小疑问,gpushare-device-plugin 项目如果仅仅发布 Gopkg.lock 和 Gopkg.toml 这两个配置文件,而不发布 vendor,我们将怎么下载依赖呢?

# 还原代码
[root@localhost gpushare-device-plugin]# git reset --hard HEAD
HEAD is now at eb657d1 rename config.yml (#10)
[root@localhost gpushare-device-plugin]# git status
# On branch master
# Untracked files:
#   (use "git add <file>..." to include in what will be committed)
#
#	go.mod
nothing added to commit but untracked files present (use "git add" to track)

# 删除所有项目依赖包
[root@localhost gpushare-device-plugin]# rm -rf go.mod
[root@localhost gpushare-device-plugin]# rm -rf /opt/ml/bin/work/*
[root@localhost gpushare-device-plugin]# rm -rf vendor/

# 重新建立目录
[root@localhost gpushare-device-plugin]# mkdir -p /opt/ml/bin/work/src/github.com/AliyunContainerService/
[root@localhost gpushare-device-plugin]# ln -s /root/lihao04/gpushare-device-plugin /opt/ml/bin/work/src/github.com/AliyunContainerService/
[root@localhost gpushare-device-plugin]# cd /opt/ml/bin/work/src/github.com/AliyunContainerService/gpushare-device-plugin

# 执行 dep ensure,耗费时间较长
[root@localhost gpushare-device-plugin]# dep ensure
[root@localhost gpushare-device-plugin]#

# 成功后,发现 vendor 目录等全部存在,Gopkg.toml 不变,但是 Gopkg.lock 有变化
[root@localhost gpushare-device-plugin]# git diff Gopkg.toml
[root@localhost gpushare-device-plugin]# git diff Gopkg.lock
diff --git a/Gopkg.lock b/Gopkg.lock
index 61eb536..3374d0f 100644
--- a/Gopkg.lock
+++ b/Gopkg.lock
@@ -9,14 +9,6 @@
   revision = "86f2a9fac6c5b597dc494420005144b8ef7ec9fb"

 [[projects]]
-  digest = "1:179992ae7637e3aa978ab9f27e9a32b1d452dc4ae443037a34d5e08aa4ca23e0"
-  name = "github.com/NVIDIA/nvidia-docker"
-  packages = ["src/nvml"]
-  pruneopts = "UT"
-  revision = "01d2c9436620d7dde4672e414698afe6da4a282f"
-  version = "v1.0.1"
-
-[[projects]]
   digest = "1:ffe9824d294da03b391f44e1ae8281281b4afc1bdaa9588c9097785e3af10cec"
   name = "github.com/davecgh/go-spew"
   packages = ["spew"]
@@ -114,17 +106,6 @@
   revision = "787624de3eb7bd915c329cba748687a3b22666a6"

 [[projects]]
-  digest = "1:8ec8d88c248041a6df5f6574b87bc00e7e0b493881dad2e7ef47b11dc69093b5"
-  name = "github.com/hashicorp/golang-lru"
-  packages = [
-    ".",
-    "simplelru",
-  ]
-  pruneopts = "UT"
-  revision = "20f1fb78b0740ba8c3cb143a61e86ba5c8669768"
-  version = "v0.5.0"

我能给出的解释是,Gopkg.toml 是用户的配置,Gopkg.lock 是满足 Gopkg.toml 的生效的、最新的软件包,比如:

[[constraint]]
  name = "k8s.io/kubernetes"
  version = "v1.11.2"

v1.11.2 可能有若干 bug fix 的小版本,这个还是不断地在升级的,因此会有一定的区别???

如果修改源码,新增依赖项目,应该怎么办?

根据上面的操作,我们使用 dep ensure -add 来增加依赖:

## 报错信息说的非常好,代码没有 import 这个库,但是我们临时加上了,但是调用 dep ensure 后,会删除
[root@localhost gpushare-device-plugin]# dep ensure -add github.com/pkg/errors
Fetching sources...

"github.com/pkg/errors" is not imported by your project, and has been temporarily added to Gopkg.lock and vendor/.
If you run "dep ensure" again before actually importing it, it will disappear from Gopkg.lock and vendor/.

懒得给大家演示成功加入一个新的依赖库的输出,总之加入新库后,可以通过 dep status 查看目前的依赖及其版本观察其变化:

[root@localhost gpushare-device-plugin]# dep status
PROJECT                                 CONSTRAINT     VERSION        REVISION  LATEST   PKGS USED
github.com/NVIDIA/gpu-monitoring-tools  branch master  branch master  7a750c7   7a750c7  1
github.com/davecgh/go-spew              v1.1.1         v1.1.1         8991bc2   v1.1.1   1
github.com/fsnotify/fsnotify            ^1.4.7         v1.4.7         c282820   v1.4.7   1
github.com/gogo/protobuf                v1.3.1         v1.3.1         5628607   v1.3.1   4
github.com/golang/glog                  branch master  branch master  23def4e   23def4e  1
github.com/golang/protobuf              v1.3.2         v1.3.2         6c65a55   v1.3.2   5
github.com/google/gofuzz                *                             24818f7            1
github.com/googleapis/gnostic           *                             0c51083            3
github.com/imdario/mergo                v0.3.5         v0.3.5         9316a62   v0.3.5   1
github.com/json-iterator/go             *                             ab8a2e0            1
github.com/modern-go/concurrent         1.0.3          1.0.3          bacd9c7   1.0.3    1
github.com/modern-go/reflect2           v1.0.1         v1.0.1         94122c3   1.0.1    1
github.com/spf13/pflag                  v1.0.1         v1.0.1         583c0c0   v1.0.1   1
golang.org/x/crypto                     *                             e84da03            1
golang.org/x/net                        branch master  branch master  da9a3fd   da9a3fd  8
golang.org/x/oauth2                     *                             9f33145            2
golang.org/x/sys                        branch master  branch master  b09406a   b09406a  2
golang.org/x/text                       v0.3.2         v0.3.2         342b2e1   v0.3.2   16
golang.org/x/time                       *                             f51c127            1
google.golang.org/appengine             v1.6.5         v1.6.5         971852b   v1.6.5   7
google.golang.org/genproto              branch master  branch master  548a555   548a555  1
google.golang.org/grpc                  ^1.24.0        v1.24.0        f6d0f9e   v1.24.0  33
gopkg.in/inf.v0                         v0.9.0         v0.9.0         3887ee9   v0.9.0   1
gopkg.in/yaml.v2                        v2.2.4         v2.2.4         f221b84   v2.2.4   1
k8s.io/api                              *                             7cf5895            36
k8s.io/apimachinery                     *                             1799e75            39
k8s.io/client-go                        ^12.0.0        v12.0.0        78d2af7   v12.0.0  59
k8s.io/klog                             v0.3.1         v0.3.1         89e63fd   v0.3.1   1
k8s.io/kube-openapi                     *                             b3a7cee            1
k8s.io/kubernetes                       ^1.16.2        v1.16.2        c97fe50   v1.16.2  2
k8s.io/utils                            *                             c2654d5            1
sigs.k8s.io/yaml                        v1.1.0         v1.1.0         fd68e98   v1.1.0   1

dep 的最佳实践

  • dep init 能够自动分析 import 的关系,将依赖添加到 Gopkg.lock、Gopkg.toml 和 vendor
  • 源码修改后,直接调用 dep ensure 自动更新 Gopkg.lock 和 vendor
  • 如果对依赖包的版本有要求,调用 dep ensure -add github.com/BurntSushi/toml@v0.2.0 指定版本或者其他
  • 使用 dep ensure -update <pkg> 进行升级

具体请参考《写 Go 代码时遇到的那些问题》

制作 device plugin 镜像

当编译出来 gpushare-device-plugin-v2 和 kubectl-inspect-gpushare-v2 后,我们修改官方的 Dockerfile 直接制作 docker 镜像:

(nni) root@ubuntu:/go/bin# cp /opt/ml/bin/work/src/github.com/AliyunContainerService/gpushare-device-plugin/Dockerfile /go/bin/
(nni) root@ubuntu:/go/bin# cd /go/bin/
(nni) root@ubuntu:/go/bin# docker build -f Dockerfile -t registry.cn-hangzhou.aliyuncs.com/acs/k8s-gpushare-plugin:v2-1.12-lihao-test .
Sending build context to Docker daemon  60.92MB
Step 1/6 : FROM debian:stretch-slim
 ---> 9634525e1209
Step 2/6 : ENV NVIDIA_VISIBLE_DEVICES=all
 ---> Using cache
 ---> c6f5ca103b63
Step 3/6 : ENV NVIDIA_DRIVER_CAPABILITIES=utility
 ---> Using cache
 ---> 5b389b7dbabe
Step 4/6 : COPY gpushare-device-plugin-v2 /usr/bin/gpushare-device-plugin-v2
 ---> Using cache
 ---> c8d9175fe5d7
Step 5/6 : COPY kubectl-inspect-gpushare-v2 /usr/bin/kubectl-inspect-gpushare-v2
 ---> e1a8a7b098fd
Step 6/6 : CMD ["gpushare-device-plugin-v2","-logtostderr"]
 ---> Running in b81830470fbd
Removing intermediate container b81830470fbd
 ---> 7fd0dfa09cbc
Successfully built 7fd0dfa09cbc
Successfully tagged registry.cn-hangzhou.aliyuncs.com/acs/k8s-gpushare-plugin:v2-1.12-lihao-test

修改后的 Dockerfile 为:

FROM debian:stretch-slim

ENV NVIDIA_VISIBLE_DEVICES=all
ENV NVIDIA_DRIVER_CAPABILITIES=utility

COPY gpushare-device-plugin-v2 /usr/bin/gpushare-device-plugin-v2

COPY kubectl-inspect-gpushare-v2 /usr/bin/kubectl-inspect-gpushare-v2

CMD ["gpushare-device-plugin-v2","-logtostderr"]

测试 device plugin 镜像

登录到装有 gpushare-device-plugin 的 k8s 环境中,修改镜像 v2-1.11-aff8a23 -> v2-1.12-lihao-test

[root@k8s-master kubernetes]# cat device-plugin-ds.yaml|grep image
      - image: registry.cn-hangzhou.aliyuncs.com/acs/k8s-gpushare-plugin:v2-1.12-lihao-test

查看升级前 device-plugin daemonset 使用的镜像:

[root@k8s-master kubernetes]# kubectl describe pod gpushare-device-plugin-ds-59jzl -n kube-system|grep Image
    Image:         registry.cn-hangzhou.aliyuncs.com/acs/k8s-gpushare-plugin:v2-1.11-aff8a23
    Image ID:      docker-pullable://registry.cn-hangzhou.aliyuncs.com/acs/k8s-gpushare-plugin@sha256:76769d69f5a5b24cbe117f8ac83a0ff7409fda6108ca982c8f3b8f763e016100

通过 kubectl-inspect-gpushare 可以查看 GPU 显存使用情况

[root@k8s-master kubernetes]# kubectl-inspect-gpushare
NAME                                             IPADDRESS     GPU0(Allocated/Total)  GPU1(Allocated/Total)  GPU2(Allocated/Total)  GPU3(Allocated/Total)  GPU4(Allocated/Total)  GPU5(Allocated/Total)  GPU6(Allocated/Total)  GPU7(Allocated/Total)  GPU Memory(GiB)
mesos-gpu-v100-online020-bdwg.cloud.qiyi.domain  10.39.41.206  13/15                  14/15                  14/15                  14/15                  14/15                  10/15                  0/15                   0/15                   79/120
--------------------------------------------------------------------------------------------------------------------------------------
Allocated/Total GPU Memory In Cluster:
79/120 (65%)

查看升级之前创建的 gpu pod

[root@k8s-master kubernetes]# kubectl get pod|grep binpack
binpack-1-0                 1/1     Running   12         7d1h
binpack-1-1                 1/1     Running   12         7d
binpack-1-2                 1/1     Running   12         7d
binpack-lihao-0             0/1     Pending   0          7d
binpack-lihao-1             1/1     Running   0          22h
binpack-lihao-2             1/1     Running   0          22h
binpack-lihao-3             1/1     Running   0          22h
binpack-lihao-4             1/1     Running   0          22h
binpack-lihao-5             1/1     Running   0          22h
binpack-lihao-6             1/1     Running   0          22h
binpack-lihao-7             1/1     Running   0          22h
binpack-lihao-8             1/1     Running   0          22h
binpack-lihao-9             1/1     Running   0          22h
binpack-test-0              1/1     Running   13         7d

开始升级:

[root@k8s-master kubernetes]# kubectl apply -f device-plugin-ds.yaml
Warning: kubectl apply should be used on resource created by either kubectl create --save-config or kubectl apply
daemonset.extensions/gpushare-device-plugin-ds configured

# 不知道为什么始终不见更新镜像(推测:extensions/v1beta1 的 DaemonSet 默认 updateStrategy 为 OnDelete,只有手动删除 pod 后才会按新模板重建,待确认),直接删除该 pod
[root@k8s-master kubernetes]# kubectl delete pod gpushare-device-plugin-ds-59jzl -n kube-system
pod "gpushare-device-plugin-ds-59jzl" deleted

# 新建的 pod 确实使用了指定的镜像
[root@k8s-master kubernetes]# kubectl describe pod gpushare-device-plugin-ds-6b44p -n kube-system|grep Image
    Image:         registry.cn-hangzhou.aliyuncs.com/acs/k8s-gpushare-plugin:v2-1.12-lihao-test
    Image ID:      docker://sha256:08ebf3626df08cf6ed27cfc301a946821b34b8fe574afbb2c2650c826e5589e0

# 新建 gpu pod
[root@k8s-master kubernetes]# kubectl apply -f example-2.yaml
statefulset.apps/binpack-2 created

查看新建的 pod 状态

# 状态正常
[root@k8s-master kubernetes]# kubectl get pod|grep binpack-2
binpack-2-0                 1/1     Running   0          48s
binpack-2-1                 1/1     Running   0          50s

# 正常使用
[root@k8s-master kubernetes]# kubectl exec -it binpack-2-0 bash
root@binpack-2-0:/notebooks# nvidia-smi
Wed Oct 16 10:03:02 2019
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 410.79       Driver Version: 410.79       CUDA Version: 10.0     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|===============================+======================+======================|
|   0  Tesla V100-SXM2...  On   | 00000000:8C:00.0 Off |                    0 |
| N/A   45C    P0    63W / 300W |  15830MiB / 16130MiB |      3%      Default |
+-------------------------------+----------------------+----------------------+

+-----------------------------------------------------------------------------+
| Processes:                                                       GPU Memory |
|  GPU       PID   Type   Process name                             Usage      |
|=============================================================================|
+-----------------------------------------------------------------------------+

查看存量 pod 的状态

# 找了一个已经运行了 22h 的存量 GPU pod
[root@k8s-master kubernetes]# kubectl get pod |grep binpack-lihao-1
binpack-lihao-1             1/1     Running   0          22h

# 使用正常
[root@k8s-master kubernetes]# kubectl exec -it binpack-lihao-1 bash
root@binpack-lihao-1:/notebooks# nvidia-smi
Wed Oct 16 10:05:01 2019
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 410.79       Driver Version: 410.79       CUDA Version: 10.0     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|===============================+======================+======================|
|   0  Tesla V100-SXM2...  On   | 00000000:8A:00.0 Off |                    0 |
| N/A   40C    P0    57W / 300W |   9300MiB / 16130MiB |      3%      Default |
+-------------------------------+----------------------+----------------------+

+-----------------------------------------------------------------------------+
| Processes:                                                       GPU Memory |
|  GPU       PID   Type   Process name                             Usage      |
|=============================================================================|
+-----------------------------------------------------------------------------+
posted on 2019-10-16 17:32  silenceli  阅读(1717)  评论(0)  编辑  收藏  举报