k8s-一次生产故障分析——kubelet堆栈和源码分析
一、node 不停重启,状态变为 NotReady;kubelet 进程一直在,但是不打日志。
在 apiserver 所在节点上执行下面的命令,打印 kubelet 的 goroutine 堆栈,并把正常节点和异常节点的输出进行对比。命令中的 192.168.10.81 替换为要排查的(异常或正常)node 的 IP。
curl "https://192.168.10.81:10250/debug/pprof/goroutine?debug=2" --cacert /etc/kubernetes/pki/ca.crt --cert /etc/kubernetes/pki/apiserver-kubelet-client.crt --key /etc/kubernetes/pki/apiserver-kubelet-client.key -k > stack.81
堆栈打印结果。
goroutine 268 [select, 50 minutes]: net.(*Resolver).LookupIPAddr(0x5d48420, 0x5c1bc40, 0xc420014070, 0xc422505434, 0xc, 0xc4218400f0, 0x27, 0x0, 0x0, 0xc420daf820) /usr/local/go/src/net/lookup.go:196 +0x52b goroutine 268 [select, 29 minutes]: net.(*Resolver).LookupIPAddr(0x5d48420, 0x5c1bc40, 0xc420014070, 0xc422505434, 0xc, 0xc4218400f0, 0x27, 0x0, 0x0, 0xc420daf820) /usr/local/go/src/net/lookup.go:196 +0x52b
代码:
func (kl *Kubelet) setUpdatedAddressesFromHostname(node *api.Node) { addr := net.ParseIP(kl.hostname) if addr == nil { addrs, err := net.LookupIP(node.Name) #如果没有ip,则需要通过 if err != nil { glog.Errorf("Can't get ip address of node %s, so node addresses will be stale: %v", node.Name, err) return } if len(addrs) == 0 { glog.Errorf("No ip address for node %v, so node addresses will be stale", node.Name) return } // check all ip addresses for this node.Name and try to find the first non-loopback IPv4 address. // If no match is found, it uses the IP of the interface with gateway on it. for _, ip := range addrs { if !ip.IsLoopback() && ip.To4() != nil { addr = ip break } } if addr == nil { ip, err := util.ChooseHostInterface() if err != nil { glog.Errorf("Failed choosing host interface, so node addresses will be stale: %v", err) return } addr = ip } }
解决办法:在 kubelet 启动参数中加上 --node-ip=<本节点 IP>(见下面命令的最后一个参数):
/usr/bin/kubelet --bootstrap-kubeconfig=/etc/kubernetes/bootstrap-kubelet.conf --kubeconfig=/etc/kubernetes/kubelet.conf --pod-manifest-path=/etc/kubernetes/manifests --allow-privileged=true --network-plugin=cni --cni-conf-dir=/etc/cni/net.d --cni-bin-dir=/opt/cni/bin --cluster-dns=10.96.0.10 --cluster-domain=cluster.local --authorization-mode=Webhook --client-ca-file=/etc/kubernetes/pki/ca.crt --cadvisor-port=0 --cgroup-driver=cgroupfs --rotate-certificates=true --cert-dir=/var/lib/kubelet/pki --node-ip=192.168.10.82
二、其他相关排查:使用 ss -antp 进行网络排查。
[root@H-LDOCKER-02 ~]# ss -antp |grep 192.168.10.81 ESTAB 0 0 192.168.10.82:49079 192.168.10.81:24007 users:(("glusterfs",pid=126656,fd=11)) ESTAB 0 0 192.168.10.82:48887 192.168.10.81:49152 users:(("glusterfs",pid=30768,fd=37)) ESTAB 0 0 192.168.10.82:49018 192.168.10.81:24007 users:(("glusterd",pid=2762,fd=122)) TIME-WAIT 0 0 192.168.10.82:46448 192.168.10.81:10250 ESTAB 0 0 192.168.10.82:48891 192.168.10.81:49165 users:(("glusterfs",pid=30768,fd=31)) ESTAB 0 0 192.168.10.82:49158 192.168.10.81:49073 users:(("glusterfsd",pid=2929,fd=5)) ESTAB 0 0 192.168.10.82:49044 192.168.10.81:49160 users:(("glusterfs",pid=7469,fd=14)) ESTAB 0 0 192.168.10.82:2380 192.168.10.81:41728 users:(("etcd",pid=65237,fd=240)) ESTAB 0 0 192.168.10.82:57894 192.168.10.81:2379 users:(("kube-apiserver",pid=76134,fd=45)) ESTAB 0 0 192.168.10.82:49100 192.168.10.81:49161 users:(("glusterfs",pid=115740,fd=13)) ESTAB 0 0 192.168.10.82:49087 192.168.10.81:24007 users:(("glusterfs",pid=114131,fd=11)) ESTAB 0 0 192.168.10.82:49047 192.168.10.81:49162 users:(("glusterfs",pid=7460,fd=14)) TIME-WAIT 0 0 192.168.10.82:47558 192.168.10.81:2379 ESTAB 0 0 192.168.10.82:44456 192.168.10.81:10250 users:(("kube-apiserver",pid=76134,fd=93)) ESTAB 0 0 192.168.10.82:57792 192.168.10.81:2379 users:(("kube-apiserver",pid=76134,fd=27)) ESTAB 0 0 192.168.10.82:57702 192.168.10.81:2379 users:(("kube-apiserver",pid=76134,fd=5)) TIME-WAIT 0 0 192.168.10.82:49284 192.168.10.81:2379 ESTAB 0 0 192.168.10.82:49066 192.168.10.81:24007 users:(("glusterfs",pid=123708,fd=11)) ESTAB 0 0 192.168.10.82:49120 192.168.10.81:49157 users:(("glusterfs",pid=114509,fd=14)) ESTAB 0 0 192.168.10.82:49111 192.168.10.81:24007 users:(("glusterfs",pid=55598,fd=11)) ESTAB 0 0 192.168.10.82:57676 192.168.10.81:2379 users:(("kube-apiserver",pid=76134,fd=8)) ESTAB 0 0 192.168.10.82:49059 192.168.10.81:24007 users:(("glusterfs",pid=7469,fd=11)) ESTAB 0 0 192.168.10.82:49110 192.168.10.81:49163 users:(("glusterfs",pid=114643,fd=13)) ESTAB 0 0 
192.168.10.82:49121 192.168.10.81:24007 users:(("glusterfs",pid=55040,fd=11)) ESTAB 0 0 192.168.10.82:57856 192.168.10.81:2379 users:(("kube-apiserver",pid=76134,fd=38)) ESTAB 0 0 192.168.10.82:49126 192.168.10.81:24007 users:(("glusterfs",pid=54888,fd=11)) ESTAB 0 0 192.168.10.82:49036 192.168.10.81:49153 users:(("glusterfs",pid=91593,fd=14)) TIME-WAIT 0 0 192.168.10.82:48796 192.168.10.81:2379 TIME-WAIT 0 0 192.168.10.82:47586 192.168.10.81:2379 ESTAB 0 0 ::ffff:192.168.10.80:6443 ::ffff:192.168.10.81:49952 users:(("kube-apiserver",pid=76134,fd=83)) ESTAB 0 0 ::ffff:192.168.10.80:6443 ::ffff:192.168.10.81:43144 users:(("kube-apiserver",pid=76134,fd=103)) ESTAB 0 0 ::ffff:192.168.10.80:6443 ::ffff:192.168.10.81:48902 users:(("kube-apiserver",pid=76134,fd=80))
三、docker 堆栈打印。 yum -y install socat
socat -d -d TCP-LISTEN:18080,fork,bind=172.30.3.102 UNIX:/var/run/docker.sock
四、kubelet cri 学习,https://zhuanlan.zhihu.com/p/87602649
kubelet -> docker client (dockermanager) -> dockerd -> containerd -> runc
五、namespace,https://blog.51cto.com/speakingbaicai/1359825?from=groupmessage
六、busybox无法拉取
docker import 导入
良禽择木而栖,贤臣择主而事