运维脚本备忘录

(250213,更新脚本)

日常巡检

Shell

#!/bin/bash
# Daily inspection script for Debian / RHEL / CentOS hosts.
# Must be run as root. The report is written under ./log and database dumps
# under ./backup, both relative to the current working directory.

if [ "$(id -u)" -ne 0 ]; then
  echo "请以 root 用户执行此脚本"
  exit 1
fi

# First IPv4 address on the host; used in the report file name and the mail
# subject. The awk program is single-quoted, so the field must be written as
# $2 -- a backslash-escaped "\$2" is an awk syntax error and leaves IPADDR empty.
IPADDR=$(ifconfig 2> /dev/null | awk '/inet / {print $2; exit}')
if [ -z "$IPADDR" ]; then
  # ifconfig (net-tools) may not be installed; fall back to iproute2.
  IPADDR=$(ip -4 -o addr show 2> /dev/null | awk '{sub(/\/.*/, "", $4); print $4; exit}')
fi
# Keep the report file name well-formed even when no address could be found.
IPADDR=${IPADDR:-unknown}

# Distribution ID taken from the first ID= line found in /etc/*release.
OS_TYPE=$(grep -h '^ID=' /etc/*release 2> /dev/null | head -n 1 | cut -d'=' -f2 | tr -d '"')
case "$OS_TYPE" in
  debian)
    OS_NAME="Debian"
    ;;
  centos|rhel)
    OS_NAME="RHEL/CentOS"
    ;;
  *)
    echo "不支持的操作系统: $OS_TYPE"
    exit 1
    ;;
esac

# Distribution version from VERSION_ID in /etc/os-release.
if [ -f /etc/os-release ]; then
  osVersion=$(grep '^VERSION_ID=' /etc/os-release | cut -d'=' -f2 | tr -d '"')
  echo "$OS_NAME 版本: ${osVersion:-未能获取版本信息}"
else
  echo "/etc/os-release 文件不存在,无法获取版本信息。"
fi

# Settings shared by the check functions below.
RESULTFILE="./log/HostDailyCheck-$IPADDR-$(date +%Y%m%d).txt"
BACKUP_DIR="./backup"
EMAIL="example@mail.com"  # mailbox that receives the report
mkdir -p "$(dirname "$RESULTFILE")"
mkdir -p "$BACKUP_DIR"

# Append a section banner (blank line + dashed title line) to the report.
# Globals:   RESULTFILE (read) - path of the report file
# Arguments: $1 - section title
function log_section {
  # $1 has to be expanded here; the former "\$1" wrote the literal text "$1".
  # printf '%s' also keeps backslashes in the title from being interpreted.
  printf '\n--------------------- %s ---------------------\n' "$1" >> "$RESULTFILE"
}

# Host monitoring: system, CPU, memory, disk and network.
# Writes OS name, kernel release, host name and current time to the report.
getSystemStatus() {
  log_section "系统状态"
  {
    printf '系统: %s\n' "$(uname -o)"
    printf '内核: %s\n' "$(uname -r)"
    printf '主机名: %s\n' "$(hostname)"
    printf '当前时间: %s\n' "$(date +'%F %T')"
  } >> "$RESULTFILE"
}

# Appends the output of lscpu to the report under a "CPU" banner.
getCpuStatus() {
  log_section "CPU 状态"
  lscpu 1>> "${RESULTFILE}"
}

# Appends the output of `free -h` to the report under a "memory" banner.
getMemStatus() {
  log_section "内存状态"
  free -h 1>> "${RESULTFILE}"
}

# Appends the output of `df -h` to the report under a "disk" banner.
getDiskStatus() {
  log_section "磁盘状态"
  df -h 1>> "${RESULTFILE}"
}

# Appends the output of `ip addr` to the report under a "network" banner.
getNetworkStatus() {
  log_section "网络状态"
  ip addr 1>> "${RESULTFILE}"
}

# Web server monitoring.
# Runs `nginx -t` and records its stdout and stderr in the report; records a
# "not installed" line instead when no nginx command is found.
checkNginxConfig() {
  log_section "Nginx 配置检查"

  if ! command -v nginx &> /dev/null; then
    echo "Nginx 未安装" >> "$RESULTFILE"
    return
  fi

  nginx -t >> "$RESULTFILE" 2>&1
  echo "Nginx 配置检查结果已记录" >> "$RESULTFILE"
}

# Records the last 10 lines of the Nginx access and error logs, or a
# "does not exist" line for each log file that is missing.
monitorNginxLogs() {
  log_section "Nginx 日志监控"

  ACCESS_LOG="/var/log/nginx/access.log"
  ERROR_LOG="/var/log/nginx/error.log"

  if [ ! -f "$ACCESS_LOG" ]; then
    echo "Nginx 访问日志不存在" >> "$RESULTFILE"
  else
    {
      echo "最近的访问日志:"
      tail -n 10 "$ACCESS_LOG"
    } >> "$RESULTFILE"
  fi

  if [ ! -f "$ERROR_LOG" ]; then
    echo "Nginx 错误日志不存在" >> "$RESULTFILE"
  else
    {
      echo "最近的错误日志:"
      tail -n 10 "$ERROR_LOG"
    } >> "$RESULTFILE"
  fi
}

# Database monitoring.
# Records whether MySQL is running and, if so, the connected-thread count,
# the client version and the full process list.
# Globals: RESULTFILE (read)
function getMySQLStatus {
  log_section "MySQL 状态"

  # The unit is called mysql on Debian, mysqld or mariadb on RHEL/CentOS.
  local unit running=0
  for unit in mysql mysqld mariadb; do
    if systemctl is-active "$unit" > /dev/null 2>&1; then
      running=1
      break
    fi
  done

  if [ "$running" -ne 1 ]; then
    echo "MySQL 服务状态: 未运行" >> "$RESULTFILE"
    return
  fi
  echo "MySQL 服务状态: 运行中" >> "$RESULTFILE"

  # awk fields are written unescaped: inside single quotes "\$2" is a syntax error.
  local connections version
  connections=$(mysql -e "SHOW STATUS LIKE 'Threads_connected';" | awk 'NR==2 {print $2}')
  echo "当前连接数: $connections" >> "$RESULTFILE"

  # "mysql -V" prints "Ver X Distrib Y, ..." (5.x / MariaDB) or "Ver Y for ..."
  # (8.x); take the token after the last of those keywords.
  version=$(mysql -V | awk '{
    for (i = 1; i < NF; i++) if ($i == "Ver" || $i == "Distrib") v = $(i + 1)
  } END { sub(/,$/, "", v); print v }')
  echo "MySQL 版本: $version" >> "$RESULTFILE"

  log_section "最近的查询性能"
  mysql -e "SHOW FULL PROCESSLIST;" >> "$RESULTFILE"
}

# Dump one MySQL database into $BACKUP_DIR and record the outcome.
# Globals: BACKUP_DIR (read), RESULTFILE (read)
function backupMySQL {
  log_section "MySQL 备份"

  # Placeholder connection settings; edit before use.
  local db_name="your_mysql_database"
  local db_user="your_mysql_user"
  local db_password="your_mysql_password"
  local timestamp dump_file
  timestamp=$(date +'%Y%m%d_%H%M%S')
  dump_file="$BACKUP_DIR/${db_name}_backup_$timestamp.sql"

  # Credentials are handed over through an option file on a pipe instead of
  # -p<password>, so the password never shows up in the process list.
  # --defaults-extra-file must be the first option on the command line.
  if mysqldump \
    --defaults-extra-file=<(printf '[client]\nuser="%s"\npassword="%s"\n' "$db_user" "$db_password") \
    "$db_name" > "$dump_file"; then
    echo "MySQL 备份成功: ${db_name}_backup_$timestamp.sql" >> "$RESULTFILE"
  else
    # Do not leave an empty or truncated dump that could be mistaken for a backup.
    rm -f -- "$dump_file"
    echo "MySQL 备份失败" >> "$RESULTFILE"
  fi
}

# Records PostgreSQL service state, connection count and recent activity,
# then triggers a dump through backupPostgreSQL.
# Globals: RESULTFILE (read)
function getPostgreSQLStatus {
  log_section "PostgreSQL 状态"

  if ! systemctl is-active postgresql >> /dev/null 2>&1; then
    echo "PostgreSQL 服务状态: 未运行" >> "$RESULTFILE"
    return
  fi
  echo "PostgreSQL 服务状态: 运行中" >> "$RESULTFILE"

  # Placeholder connection settings; edit before use.
  local db_name="your_postgres_database"
  local db_user="your_postgres_user"
  local timestamp connections
  timestamp=$(date +'%Y%m%d_%H%M%S')

  # Only query when psql is actually present (it used to be run regardless).
  # -w makes psql fail instead of waiting at a password prompt.
  if command -v psql &> /dev/null; then
    echo "psql 可用" >> "$RESULTFILE"

    connections=$(psql -w -U "$db_user" -d "$db_name" -t \
      -c "SELECT COUNT(*) FROM pg_stat_activity;" | xargs)
    echo "当前连接数: $connections" >> "$RESULTFILE"

    echo -e "\n查询性能:" >> "$RESULTFILE"
    psql -w -U "$db_user" -d "$db_name" \
      -c "SELECT * FROM pg_stat_activity ORDER BY state_change DESC LIMIT 5;" >> "$RESULTFILE"
  else
    echo "psql 未安装或不可用" >> "$RESULTFILE"
  fi

  # Likewise, only attempt the backup when pg_dump exists.
  if command -v pg_dump &> /dev/null; then
    echo "pg_dump 可用" >> "$RESULTFILE"
    backupPostgreSQL "$db_name" "$db_user" "$timestamp"
  else
    echo "pg_dump 未安装或不可用" >> "$RESULTFILE"
  fi
}

# Dump one PostgreSQL database into $BACKUP_DIR and record the outcome.
# Globals:   BACKUP_DIR (read), RESULTFILE (read)
# Arguments: $1 - database name
#            $2 - database user
#            $3 - timestamp used in the dump file name
function backupPostgreSQL {
  log_section "PostgreSQL 备份"

  # The arguments must be expanded: the former "\$1" assigned the literal
  # text "$1", so pg_dump was asked for a database named "$1".
  local db_name="$1"
  local db_user="$2"
  local timestamp="$3"
  local dump_file="$BACKUP_DIR/${db_name}_backup_$timestamp.sql"

  # -w: never wait at a password prompt when running unattended.
  if pg_dump -w -U "$db_user" "$db_name" > "$dump_file"; then
    echo "PostgreSQL 备份成功: ${db_name}_backup_$timestamp.sql" >> "$RESULTFILE"
  else
    # Do not leave an empty or truncated dump that could be mistaken for a backup.
    rm -f -- "$dump_file"
    echo "PostgreSQL 备份失败" >> "$RESULTFILE"
  fi
}

# In-memory database monitoring.
# Records whether the redis unit is active and, if so, the `redis-cli info` output.
checkRedisStatus() {
  log_section "Redis 状态"

  if ! systemctl is-active redis >> /dev/null 2>&1; then
    echo "Redis 服务状态: 未运行" >> "$RESULTFILE"
    return
  fi

  echo "Redis 服务状态: 运行中" >> "$RESULTFILE"
  REDIS_INFO=$(redis-cli info)
  {
    echo "Redis 性能信息:"
    echo "$REDIS_INFO"
  } >> "$RESULTFILE"
}

# Message queue monitoring.
# Records whether rabbitmq-server is active and, if so, one line per queue
# with its message and consumer counts.
# Globals: RESULTFILE (read)
function getRabbitMQStatus {
  log_section "RabbitMQ 状态"

  if systemctl is-active rabbitmq-server >> /dev/null 2>&1; then
    echo "RabbitMQ 服务状态: 运行中" >> "$RESULTFILE"

    # Keep only tab-separated three-column rows and drop the column header,
    # so banner lines such as "Timeout: ..." / "Listing queues ..." are not
    # reported as queues. awk fields are unescaped: "\$1" inside single
    # quotes is an awk syntax error.
    rabbitmqctl list_queues name messages consumers \
      | awk -F'\t' 'NF == 3 && !($1 == "name" && $2 == "messages" && $3 == "consumers") {
          print "队列名: "$1", 消息数: "$2", 消费者数: "$3
        }' >> "$RESULTFILE"
  else
    echo "RabbitMQ 服务状态: 未运行" >> "$RESULTFILE"
  fi
}

# Log service monitoring.
# Records the last 10 lines of /var/log/haproxy.log, or a "does not exist"
# line when the file is missing.
monitorHAProxyLogs() {
  log_section "HAProxy 日志监控"

  ACCESS_LOG="/var/log/haproxy.log"

  if [ ! -f "$ACCESS_LOG" ]; then
    echo "HAProxy 日志不存在" >> "$RESULTFILE"
    return
  fi

  echo "最近的 HAProxy 日志:" >> "$RESULTFILE"
  tail -n 10 "$ACCESS_LOG" >> "$RESULTFILE"
}

# Software monitoring.
# Records the `systemctl is-active` state of ntpd, or of chronyd when ntpd is
# not installed; records a "not installed" line when neither command exists.
getNTPStatus() {
  log_section "NTP 状态"

  local daemon
  for daemon in ntpd chronyd; do
    if command -v "$daemon" &> /dev/null; then
      echo "NTP服务状态: $(systemctl is-active "$daemon")" >> "$RESULTFILE"
      return
    fi
  done

  echo "NTP服务未安装或未配置" >> "$RESULTFILE"
}

# Records the output of `java -version` (stdout and stderr), or a
# "not installed" line when no java command is found.
getJDKStatus() {
  log_section "JDK 状态"

  if ! command -v java &> /dev/null; then
    echo "Java 未安装" >> "$RESULTFILE"
    return
  fi

  java -version >> "$RESULTFILE" 2>&1
}

# Records the output of `mvn -v` (stdout and stderr), or a "not installed"
# line when no mvn command is found.
getMavenStatus() {
  log_section "Maven 状态"

  if ! command -v mvn &> /dev/null; then
    echo "Maven 未安装" >> "$RESULTFILE"
    return
  fi

  mvn -v >> "$RESULTFILE" 2>&1
}

# Records whether the prometheus unit is active and, if so, whether the HTTP
# status code returned for the local /metrics endpoint contains "200".
checkPrometheus() {
  log_section "Prometheus 状态"

  if ! systemctl is-active prometheus >> /dev/null 2>&1; then
    echo "Prometheus 服务状态: 未运行" >> "$RESULTFILE"
    return
  fi
  echo "Prometheus 服务状态: 运行中" >> "$RESULTFILE"

  PROMETHEUS_URL="http://localhost:9090/metrics"
  local verdict="Prometheus 指标收集异常"
  if curl -s -o /dev/null -w "%{http_code}" "$PROMETHEUS_URL" | grep -q "200"; then
    verdict="Prometheus 指标收集正常"
  fi
  echo "$verdict" >> "$RESULTFILE"
}

# Records whether grafana-server is active and, if so, whether Grafana's
# health endpoint answers with HTTP 200.
# Globals: RESULTFILE (read)
function checkGrafana {
  log_section "Grafana 状态"

  if systemctl is-active grafana-server >> /dev/null 2>&1; then
    echo "Grafana 服务状态: 运行中" >> "$RESULTFILE"

    # Probe /api/health rather than "/": the root page redirects
    # unauthenticated clients to the login page (302), so a check for 200
    # there reports a healthy Grafana as unavailable.
    local health_url="http://localhost:3000/api/health"
    local http_code
    # --max-time keeps an unresponsive Grafana from stalling the whole run.
    http_code=$(curl -s -o /dev/null -w "%{http_code}" --max-time 10 "$health_url")
    if [ "$http_code" = "200" ]; then
      echo "Grafana 仪表盘可用" >> "$RESULTFILE"
    else
      echo "Grafana 仪表盘不可用" >> "$RESULTFILE"
    fi
  else
    echo "Grafana 服务状态: 未运行" >> "$RESULTFILE"
  fi
}

# CI/CD monitoring.
# Records, for the gitlab and jenkins units in that order, whether
# `systemctl is-active` reports them as running.
getCIStatus() {
  log_section "CI/CD 状态"

  local entry label unit
  for entry in "GitLab:gitlab" "Jenkins:jenkins"; do
    label=${entry%%:*}
    unit=${entry#*:}
    if systemctl is-active "$unit" >> /dev/null 2>&1; then
      echo "$label 服务状态: 运行中" >> "$RESULTFILE"
    else
      echo "$label 服务状态: 未运行" >> "$RESULTFILE"
    fi
  done
}

# 容器监控
function dockerInspection {
  log_section "Docker 状态"

  if command -v docker &> /dev/null; then
    echo "当前运行的 Docker 容器:" >> "$RESULTFILE"
    docker ps >> "$RESULTFILE"
    
    echo -e "\n所有 Docker 容器:" >> "$RESULTFILE"
    docker ps -a >> "$RESULTFILE"
    
    echo -e "\n当前 Docker 镜像:" >> "$RESULTFILE"
    docker images >> "$RESULTFILE"
    
    echo -e "\nDocker 容器资源使用情况:" >> "$RESULTFILE"
    docker stats --no-stream >> "$RESULTFILE"

    echo -e "\nDocker 容器日志:" >> "$RESULTFILE"
    CONTAINERS=$(docker ps -q)
    for CONTAINER in $CONTAINERS; do
      echo -e "\n容器 $CONTAINER 的日志:" >> "$RESULTFILE"
      docker logs "$CONTAINER" --tail 10 >> "$RESULTFILE"
    done

    echo -e "\n当前 Docker 网络配置:" >> "$RESULTFILE"
    docker network ls >> "$RESULTFILE"
  else
    echo "Docker 未安装" >> "$RESULTFILE"
  fi
}

# Container orchestration monitoring.
# Records `kubectl cluster-info`, the node list and the pods of all
# namespaces; records a "not installed" line when no kubectl command is found.
checkKubernetesStatus() {
  log_section "Kubernetes 状态"

  if ! command -v kubectl &> /dev/null; then
    echo "kubectl 未安装" >> "$RESULTFILE"
    return
  fi

  {
    echo "Kubernetes 集群状态:"
    CLUSTER_STATUS=$(kubectl cluster-info)
    echo "$CLUSTER_STATUS"

    echo -e "\n节点状态:"
    kubectl get nodes

    echo -e "\nPod 状态:"
    kubectl get pods --all-namespaces
  } >> "$RESULTFILE"
}

# Main: run every check, collect the report, then mail it.
#
# The report is emptied once and the group is then opened in append mode.
# The check functions append to the same file themselves; with a plain ">"
# on the group, anything written to the group's stdout/stderr (for example
# error messages of failing commands) would be written from offset 0 and
# overwrite what the functions had already appended.
: > "$RESULTFILE"
{
  getSystemStatus
  getCpuStatus
  getMemStatus
  getDiskStatus
  getNetworkStatus
  checkNginxConfig
  monitorNginxLogs
  getMySQLStatus
  backupMySQL
  getPostgreSQLStatus
  checkRedisStatus
  getRabbitMQStatus
  # Defined above but previously never invoked.
  monitorHAProxyLogs
  getNTPStatus
  getJDKStatus
  getMavenStatus
  checkPrometheus
  checkGrafana
  getCIStatus
  dockerInspection
  checkKubernetesStatus
} >> "$RESULTFILE" 2>&1

# Mail the report; only claim delivery when the mail command succeeded.
if command -v mail > /dev/null 2>&1 \
  && mail -s "巡检结果 - $IPADDR" "$EMAIL" < "$RESULTFILE"; then
  echo "检查结果已保存到:$RESULTFILE,且已发送到 $EMAIL"
else
  echo "检查结果已保存到:$RESULTFILE"
  echo "邮件发送失败,请检查 mail 命令是否可用" >&2
fi

Go(未修改)

package main

import (
	"bytes"
	"fmt"
	"net/mail"
	"net/smtp"
	"os"
	"os/exec"
	"strings"
	"time"
)

// Mail settings used by sendEmail.
// NOTE(review): these are placeholders; real credentials should presumably
// not be committed in source - confirm how they are meant to be supplied.
const (
	emailSender   = "your_email@gmail.com" // sender mailbox; also the SMTP auth user name
	emailPassword = "your_email_password"    // password of the sender mailbox
	emailReceiver = "example@mail.com"        // mailbox that receives the report
)

// main runs the inspection as root, writes the report to
// ./log/HostDailyCheck-<ip>-<yyyymmdd>.txt and mails the same text to
// emailReceiver. Every failure is printed and ends the run early, except a
// mail failure, which is only reported.
func main() {
	if os.Geteuid() != 0 {
		fmt.Println("请以 root 用户执行此脚本")
		return
	}

	ipAddr, err := getIP()
	if err != nil {
		fmt.Println("获取IP地址失败:", err)
		return
	}

	osName, osVersion, err := getOSInfo()
	if err != nil {
		fmt.Println("获取操作系统信息失败:", err)
		return
	}

	// Go layouts are spelled with the reference date; an empty layout formats
	// to "" and left the date out of the file name.
	resultFile := fmt.Sprintf("./log/HostDailyCheck-%s-%s.txt", ipAddr, time.Now().Format("20060102"))
	for _, dir := range []string{"./log", "./backup"} {
		if err := os.MkdirAll(dir, os.ModePerm); err != nil {
			fmt.Println("创建目录失败:", err)
			return
		}
	}

	var result bytes.Buffer
	result.WriteString(fmt.Sprintf("IP 地址: %s\n", ipAddr))
	result.WriteString(fmt.Sprintf("%s 版本: %s\n", osName, osVersion))

	result.WriteString(getSystemStatus())
	result.WriteString(getCPUStatus())
	result.WriteString(getMemStatus())
	result.WriteString(getDiskStatus())
	result.WriteString(getNetworkStatus())
	result.WriteString(checkNginxConfig())
	result.WriteString(monitorNginxLogs())
	result.WriteString(getMySQLStatus())
	// Backup and other status checks can be added similarly

	err = os.WriteFile(resultFile, result.Bytes(), 0644)
	if err != nil {
		fmt.Println("写入结果文件失败:", err)
		return
	}

	err = sendEmail("巡检结果 - "+ipAddr, result.String())
	if err != nil {
		fmt.Println("发送邮件失败:", err)
	} else {
		fmt.Println("检查结果已发送到:", emailReceiver)
	}
}

// getIP returns the address on the first "inet " line printed by ifconfig.
// It returns an error when ifconfig cannot be run or prints no such line.
func getIP() (string, error) {
	output, err := exec.Command("ifconfig").Output()
	if err != nil {
		return "", err
	}
	for _, line := range strings.Split(string(output), "\n") {
		if !strings.Contains(line, "inet ") {
			continue
		}
		parts := strings.Fields(line)
		// Guard the index: a line without a second field would panic.
		if len(parts) < 2 {
			continue
		}
		// Older net-tools print "inet addr:192.0.2.1"; drop that prefix.
		return strings.TrimPrefix(parts[1], "addr:"), nil
	}
	return "", fmt.Errorf("无法找到 IP 地址")
}

// getOSInfo reads ID and VERSION_ID from /etc/os-release and returns them as
// (name, version). It returns an error when the file cannot be read or when
// either value is missing.
func getOSInfo() (string, string, error) {
	// Read the file directly instead of spawning "cat".
	data, err := os.ReadFile("/etc/os-release")
	if err != nil {
		return "", "", err
	}

	var osName, osVersion string
	for _, line := range strings.Split(string(data), "\n") {
		line = strings.TrimSpace(line)
		switch {
		case strings.HasPrefix(line, "ID="):
			// Values may be wrapped in double or single quotes.
			osName = strings.Trim(strings.TrimPrefix(line, "ID="), `"'`)
		case strings.HasPrefix(line, "VERSION_ID="):
			osVersion = strings.Trim(strings.TrimPrefix(line, "VERSION_ID="), `"'`)
		}
	}

	if osName == "" || osVersion == "" {
		return "", "", fmt.Errorf("无法获取操作系统信息")
	}

	return osName, osVersion, nil
}

// getSystemStatus returns the "system status" section: OS name, kernel
// release, host name (each via execOutput) and the current time in RFC 1123
// format.
func getSystemStatus() string {
	var report bytes.Buffer
	report.WriteString("系统状态:\n")
	fmt.Fprintf(&report, "系统: %s\n", execOutput("uname", "-o"))
	fmt.Fprintf(&report, "内核: %s\n", execOutput("uname", "-r"))
	fmt.Fprintf(&report, "主机名: %s\n", execOutput("hostname"))
	fmt.Fprintf(&report, "当前时间: %s\n", time.Now().Format(time.RFC1123))
	return report.String()
}

func getCPUStatus() string {
	return fmt.Sprintf("CPU 状态:\n%s\n", execOutput("lscpu"))
}

func getMemStatus() string {
	return fmt.Sprintf("内存状态:\n%s\n", execOutput("free", "-h"))
}

func getDiskStatus() string {
	return fmt.Sprintf("磁盘状态:\n%s\n", execOutput("df", "-h"))
}

func getNetworkStatus() string {
	return fmt.Sprintf("网络状态:\n%s\n", execOutput("ip", "addr"))
}

// checkNginxConfig returns the "Nginx configuration check" section: the
// combined stdout/stderr of `nginx -t`, or a "not installed" line when nginx
// is not on PATH.
func checkNginxConfig() string {
	var result bytes.Buffer
	result.WriteString("Nginx 配置检查:\n")
	if _, err := exec.LookPath("nginx"); err == nil {
		// nginx -t reports its verdict on stderr, so stdout alone is empty on
		// success and carries no detail on failure; capture both streams.
		out, runErr := exec.Command("nginx", "-t").CombinedOutput()
		result.Write(out)
		if runErr != nil {
			result.WriteString(fmt.Sprintf("执行命令失败: %s\n", runErr))
		}
	} else {
		result.WriteString("Nginx 未安装\n")
	}
	return result.String()
}

// monitorNginxLogs returns the "Nginx log monitoring" section: the last 10
// lines of the access log and of the error log, or a "does not exist" line
// for each file that os.Stat cannot find.
func monitorNginxLogs() string {
	var report bytes.Buffer
	report.WriteString("Nginx 日志监控:\n")

	logs := []struct {
		label string
		path  string
	}{
		{"访问", "/var/log/nginx/access.log"},
		{"错误", "/var/log/nginx/error.log"},
	}

	for _, entry := range logs {
		if _, err := os.Stat(entry.path); err != nil {
			report.WriteString("Nginx " + entry.label + "日志不存在\n")
			continue
		}
		report.WriteString("最近的" + entry.label + "日志:\n")
		report.WriteString(execOutput("tail", "-n", "10", entry.path))
	}

	return report.String()
}

// getMySQLStatus returns the "MySQL status" section. When
// `systemctl is-active mysql` succeeds it adds the Threads_connected status,
// the client version and the full process list; otherwise a "not running" line.
func getMySQLStatus() string {
	var report bytes.Buffer
	report.WriteString("MySQL 状态:\n")

	if err := exec.Command("systemctl", "is-active", "mysql").Run(); err != nil {
		report.WriteString("MySQL 服务状态: 未运行\n")
		return report.String()
	}

	report.WriteString("MySQL 服务状态: 运行中\n")
	report.WriteString(execOutput("mysql", "-e", "SHOW STATUS LIKE 'Threads_connected';"))
	report.WriteString(execOutput("mysql", "-V"))
	report.WriteString("最近的查询性能:\n")
	report.WriteString(execOutput("mysql", "-e", "SHOW FULL PROCESSLIST;"))
	return report.String()
}

// execOutput runs command with args and returns its standard output. When
// the command cannot be started or exits with an error, it returns a one-line
// failure message containing the error instead.
func execOutput(command string, args ...string) string {
	stdout, runErr := exec.Command(command, args...).Output()
	if runErr == nil {
		return string(stdout)
	}
	return fmt.Sprintf("执行命令失败: %s\n", runErr)
}

// sendEmail sends body as a plain-text UTF-8 message with the given subject
// from emailSender to emailReceiver via smtp.gmail.com:587, authenticating
// with emailSender/emailPassword. It returns the error from smtp.SendMail.
func sendEmail(subject, body string) error {
	auth := smtp.PlainAuth("", emailSender, emailPassword, "smtp.gmail.com")

	// Format the header addresses with net/mail (this also puts the file's
	// net/mail import to use).
	from := mail.Address{Address: emailSender}
	to := mail.Address{Address: emailReceiver}

	// Strip CR/LF so the subject cannot inject additional header lines.
	subject = strings.NewReplacer("\r", " ", "\n", " ").Replace(subject)

	// NOTE(review): the subject is sent as raw UTF-8; confirm the relay
	// accepts that or add RFC 2047 encoding.
	msg := []byte("From: " + from.String() + "\r\n" +
		"To: " + to.String() + "\r\n" +
		"Subject: " + subject + "\r\n" +
		"MIME-Version: 1.0\r\n" +
		"Content-Type: text/plain; charset=UTF-8\r\n" +
		"Content-Transfer-Encoding: 8bit\r\n" +
		"\r\n" +
		body)

	return smtp.SendMail("smtp.gmail.com:587", auth, emailSender, []string{emailReceiver}, msg)
}

Python

import psutil
import subprocess
import logging
import requests
from kubernetes import client, config
import docker
import pymysql
import redis
import pymongo
import json
from concurrent.futures import ThreadPoolExecutor, as_completed
from functools import partial
import os
# 配置日志
logging.basicConfig(filename='microservice_daily_check.log', level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
# 钉钉Webhook URL(从环境变量中获取)
DINGTALK_WEBHOOK_URL = os.getenv("DINGTALK_WEBHOOK_URL")
def send_dingtalk_alert(webhook_url, message, timeout=10):
    """
    Send a text alert through a DingTalk robot webhook.

    Failures are logged and never raised, so alerting cannot break the caller.

    :param webhook_url: webhook URL of the DingTalk robot
    :param message: alert text
    :param timeout: seconds to wait for the HTTP request before giving up
    """
    headers = {"Content-Type": "application/json"}
    data = {
        "msgtype": "text",
        "text": {
            "content": message
        }
    }
    try:
        # Without a timeout an unreachable webhook would block the worker forever.
        response = requests.post(webhook_url, headers=headers, data=json.dumps(data), timeout=timeout)
        if response.status_code != 200:
            logging.error(f"Failed to send DingTalk alert: {response.text}")
            return
        # DingTalk reports rejected messages with HTTP 200 and a non-zero errcode.
        try:
            errcode = response.json().get("errcode", 0)
        except ValueError:
            errcode = 0
        if errcode == 0:
            logging.info("DingTalk alert sent successfully.")
        else:
            logging.error(f"Failed to send DingTalk alert: {response.text}")
    except Exception as e:
        logging.error(f"Error sending DingTalk alert: {str(e)}", exc_info=True)
def log_exception(func):
    """
    Decorator: log any exception raised by ``func`` and send a DingTalk alert.

    The exception is swallowed; the wrapped call then returns None.

    :param func: function to wrap
    :return: wrapper with the same call signature as ``func``
    """
    from functools import wraps

    # wraps() keeps __name__/__doc__ of the decorated check, so log lines and
    # debugging output do not show every check as "wrapper".
    @wraps(func)
    def wrapper(*args, **kwargs):
        try:
            return func(*args, **kwargs)
        except Exception as e:
            error_message = f"Error in {func.__name__}: {str(e)}"
            logging.error(error_message, exc_info=True)
            if DINGTALK_WEBHOOK_URL:
                send_dingtalk_alert(DINGTALK_WEBHOOK_URL, error_message)
    return wrapper
@log_exception
def check_system_resources():
    """Log CPU, memory and root-disk usage percentages plus network byte counters."""
    cpu_percent = psutil.cpu_percent(interval=1)
    mem = psutil.virtual_memory()
    root_disk = psutil.disk_usage('/')
    net = psutil.net_io_counters()
    report_lines = (
        f"CPU Usage: {cpu_percent}%",
        f"Memory Usage: {mem.percent}%",
        f"Disk Usage: {root_disk.percent}%",
        f"Network Sent: {net.bytes_sent} bytes, Received: {net.bytes_recv} bytes",
    )
    for line in report_lines:
        logging.info(line)
@log_exception
def check_docker_containers():
    """Log the name and status of every container listed by the Docker client."""
    for item in docker.from_env().containers.list():
        logging.info(f"Container {item.name} status: {item.status}")
@log_exception
def check_kubernetes_pods():
    """Log the phase of every pod in all namespaces, using the local kube config."""
    config.load_kube_config()  # load the Kubernetes client configuration
    core_api = client.CoreV1Api()
    pod_list = core_api.list_pod_for_all_namespaces()
    for item in pod_list.items:
        meta = item.metadata
        logging.info(f"Pod {meta.name} in namespace {meta.namespace} status: {item.status.phase}")
@log_exception
def check_microservice_health(url, timeout=10):
    """
    Check a microservice health endpoint.

    :param url: URL to request; HTTP 200 counts as healthy
    :param timeout: seconds to wait for the response
    """
    # A timeout keeps one unresponsive service from blocking its worker thread forever.
    response = requests.get(url, timeout=timeout)
    if response.status_code == 200:
        logging.info(f"Microservice at {url} is healthy.")
    else:
        logging.error(f"Microservice at {url} returned status code {response.status_code}.")
@log_exception
def check_database_status(db_type, **kwargs):
    """
    Check that a database answers a trivial request.

    :param db_type: "mysql", "redis" or "mongodb"
    :param kwargs: passed unchanged to pymysql.connect, redis.Redis or
        pymongo.MongoClient respectively
    :raises ValueError: for any other db_type (reported by log_exception)
    """
    if db_type == "mysql":
        connection = pymysql.connect(**kwargs)
        try:
            with connection.cursor() as cursor:
                cursor.execute("SELECT 1")
                result = cursor.fetchone()
            if result and result[0] == 1:
                logging.info("MySQL is running.")
            else:
                logging.error("MySQL is not running.")
        finally:
            # Always release the connection, also when the query raises.
            connection.close()
    elif db_type == "redis":
        r = redis.Redis(**kwargs)
        if r.ping():
            logging.info("Redis is running.")
        else:
            logging.error("Redis is not running.")
    elif db_type == "mongodb":
        # Named mongo_client so it does not shadow the imported kubernetes `client`.
        mongo_client = pymongo.MongoClient(**kwargs)
        try:
            if mongo_client.server_info():
                logging.info("MongoDB is running.")
            else:
                logging.error("MongoDB is not running.")
        finally:
            mongo_client.close()
    else:
        # An unknown type used to be skipped without any trace.
        raise ValueError(f"Unsupported database type: {db_type}")
@log_exception
def check_rabbitmq_status(host="localhost", port=15672, user="guest", password="guest", timeout=10):
    """
    Check RabbitMQ through the management plugin's HTTP API.

    :param host: management API host
    :param port: management API port
    :param user: management API user
    :param password: management API password
    :param timeout: seconds to wait for the response
    """
    # The management API exposes its health checks under /api/health/checks/...;
    # the alarms check answers 200 {"status": "ok"} when no alarm is in effect.
    url = f"http://{host}:{port}/api/health/checks/alarms"
    response = requests.get(url, auth=(user, password), timeout=timeout)
    if response.status_code == 200 and response.json().get("status") == "ok":
        logging.info("RabbitMQ is healthy.")
    else:
        logging.error(f"RabbitMQ is not healthy: {response.text}")
@log_exception
def check_kafka_status(broker="localhost:9092"):
    """
    Check Kafka by listing topics through an admin client.

    :param broker: bootstrap server address ("host:port")
    """
    from kafka import KafkaAdminClient
    admin_client = KafkaAdminClient(bootstrap_servers=broker)
    try:
        topics = admin_client.list_topics()
    finally:
        # Release the broker connections even when the listing fails.
        admin_client.close()
    if topics:
        logging.info("Kafka is running.")
    else:
        logging.error("Kafka has no topics.")
@log_exception
def check_eureka_services(eureka_url="http://localhost:8761/eureka/apps", timeout=10):
    """
    Log every application registered in Eureka.

    :param eureka_url: URL of the Eureka applications endpoint
    :param timeout: seconds to wait for the response
    """
    # Eureka answers with XML unless JSON is requested explicitly, which made
    # response.json() fail.
    response = requests.get(eureka_url, headers={"Accept": "application/json"}, timeout=timeout)
    if response.status_code == 200:
        apps = response.json().get("applications", {}).get("application", [])
        # A single registered application may be serialised as an object
        # instead of a one-element list.
        if isinstance(apps, dict):
            apps = [apps]
        for app in apps:
            logging.info(f"Service {app['name']} is registered in Eureka.")
    else:
        logging.error(f"Failed to fetch Eureka services: {response.text}")
@log_exception
def check_nginx_status():
    """Log whether `systemctl status nginx` reports the unit as active (running)."""
    status = subprocess.run(['systemctl', 'status', 'nginx'], capture_output=True, text=True)
    is_running = "active (running)" in status.stdout
    if not is_running:
        logging.error("Nginx is not running.")
        return
    logging.info("Nginx is running.")
@log_exception
def check_haproxy_status():
    """Log whether `systemctl status haproxy` reports the unit as active (running)."""
    status = subprocess.run(['systemctl', 'status', 'haproxy'], capture_output=True, text=True)
    is_running = "active (running)" in status.stdout
    if not is_running:
        logging.error("HAProxy is not running.")
        return
    logging.info("HAProxy is running.")
@log_exception
def check_elk_status(elasticsearch_url="http://localhost:9200", kibana_url="http://localhost:5601", timeout=10):
    """
    Check Elasticsearch and Kibana over HTTP; HTTP 200 counts as running.

    Both services are always probed. Connection failures are collected and
    raised together at the end so log_exception still logs and alerts on them.

    :param elasticsearch_url: Elasticsearch base URL
    :param kibana_url: Kibana base URL
    :param timeout: seconds to wait for each response
    :raises RuntimeError: when at least one service could not be reached
    """
    unreachable = []
    for name, url in (("Elasticsearch", elasticsearch_url), ("Kibana", kibana_url)):
        try:
            response = requests.get(url, timeout=timeout)
        except requests.RequestException as exc:
            # Keep going: a dead Elasticsearch must not hide Kibana's state.
            unreachable.append(f"{name} is unreachable: {exc}")
            continue
        if response.status_code == 200:
            logging.info(f"{name} is running.")
        else:
            logging.error(f"{name} is not running: {response.text}")
    if unreachable:
        raise RuntimeError("; ".join(unreachable))
@log_exception
def check_prometheus_grafana(prometheus_url="http://localhost:9090", grafana_url="http://localhost:3000", timeout=10):
    """
    Check Prometheus and Grafana over HTTP; HTTP 200 counts as running.

    Both services are always probed. Connection failures are collected and
    raised together at the end so log_exception still logs and alerts on them.

    :param prometheus_url: Prometheus base URL
    :param grafana_url: Grafana base URL
    :param timeout: seconds to wait for each response
    :raises RuntimeError: when at least one service could not be reached
    """
    unreachable = []
    for name, url in (("Prometheus", prometheus_url), ("Grafana", grafana_url)):
        try:
            response = requests.get(url, timeout=timeout)
        except requests.RequestException as exc:
            # Keep going: a dead Prometheus must not hide Grafana's state.
            unreachable.append(f"{name} is unreachable: {exc}")
            continue
        if response.status_code == 200:
            logging.info(f"{name} is running.")
        else:
            logging.error(f"{name} is not running: {response.text}")
    if unreachable:
        raise RuntimeError("; ".join(unreachable))
@log_exception
def check_jenkins_status(jenkins_url="http://localhost:8080", timeout=10):
    """
    Check Jenkins over HTTP; HTTP 200 counts as running.

    :param jenkins_url: Jenkins base URL
    :param timeout: seconds to wait for the response
    """
    # A timeout keeps an unresponsive Jenkins from blocking its worker thread forever.
    response = requests.get(jenkins_url, timeout=timeout)
    if response.status_code == 200:
        logging.info("Jenkins is running.")
    else:
        logging.error(f"Jenkins is not running: {response.text}")
@log_exception
def check_gitlab_ci_status(gitlab_url="http://localhost", project_id=1, token="your_token", timeout=10):
    """
    Check GitLab CI by listing the pipelines of one project.

    :param gitlab_url: GitLab base URL
    :param project_id: ID of the project whose pipelines are listed
    :param token: value sent in the PRIVATE-TOKEN header
    :param timeout: seconds to wait for the response
    """
    url = f"{gitlab_url}/api/v4/projects/{project_id}/pipelines"
    headers = {"PRIVATE-TOKEN": token}
    # A timeout keeps an unresponsive GitLab from blocking its worker thread forever.
    response = requests.get(url, headers=headers, timeout=timeout)
    if response.status_code == 200:
        pipelines = response.json()
        if pipelines:
            logging.info(f"GitLab CI pipelines found: {len(pipelines)}")
        else:
            logging.error("No GitLab CI pipelines found.")
    else:
        logging.error(f"Failed to fetch GitLab CI pipelines: {response.text}")
def run_checks_concurrently(checks):
    """
    Run the given checks on a pool of up to 10 worker threads.

    An exception raised by a check is logged and does not stop the others.

    :param checks: iterable of zero-argument callables
    """
    with ThreadPoolExecutor(max_workers=10) as pool:
        pending = []
        for task in checks:
            pending.append(pool.submit(task))
        for finished in as_completed(pending):
            try:
                finished.result()
            except Exception as exc:
                logging.error(f"Error in concurrent task: {str(exc)}", exc_info=True)
def main():
    """Entry point: build the list of checks and run them concurrently."""
    logging.info("Starting microservice daily check...")
    # Checks are grouped by area; the concatenation keeps the submission order.
    infrastructure = [
        check_system_resources,
        check_docker_containers,
        check_kubernetes_pods,
    ]
    services = [
        partial(check_microservice_health, "http://example.com/health"),
    ]
    databases = [
        partial(check_database_status, "mysql", host="localhost", user="root", password="password", database="test"),
        partial(check_database_status, "redis", host="localhost", port=6379),
        partial(check_database_status, "mongodb", host="localhost", port=27017),
    ]
    platform = [
        check_rabbitmq_status,
        check_kafka_status,
        check_eureka_services,
        check_nginx_status,
        check_haproxy_status,
        check_elk_status,
        check_prometheus_grafana,
        check_jenkins_status,
        check_gitlab_ci_status,
    ]
    run_checks_concurrently(infrastructure + services + databases + platform)
    logging.info("Microservice daily check completed.")
if __name__ == "__main__":
    main()

脚本结构说明

  1. 日志配置
    • 使用logging模块记录日志,日志文件为microservice_daily_check.log。
  2. 钉钉告警
    • 通过send_dingtalk_alert函数发送告警消息,集成在log_exception装饰器中。
  3. 检查函数
    • 每个检查函数都使用@log_exception装饰器捕获异常并发送告警。
    • 支持检查系统资源、Docker容器、Kubernetes Pods、微服务健康状态、数据库、消息队列、服务发现、负载均衡、日志与监控、CI/CD工具等。
  4. 并发执行
    • 使用ThreadPoolExecutor并发执行检查任务,提高效率。
  5. 主函数
    • 定义所有检查任务并调用run_checks_concurrently并发执行。

https证书以及k8s证书

1. 证书类型

HTTPS 证书

  1. 服务器证书
    • 用于 HTTPS 服务器,如 Nginx、Apache。
    • 包含公钥和私钥,通常由 CA 签发。
  2. 客户端证书
    • 用于客户端身份验证,如双向 TLS(mTLS)。
    • 包含公钥和私钥,通常由 CA 签发。

Kubernetes 集群证书

  1. CA 证书
    • ca.crt、ca.key:用于签发其他证书。
  2. API Server 证书
    • apiserver.crt、apiserver.key:用于 API Server 的 HTTPS 通信。
  3. Kubelet 客户端证书
    • apiserver-kubelet-client.crt、apiserver-kubelet-client.key:用于 API Server 与 Kubelet 的通信。
  4. Front Proxy 证书
    • front-proxy-ca.crt、front-proxy-ca.key:用于前端代理的 HTTPS 通信。
  5. Etcd 证书
    • etcd/ca.crt、etcd/ca.key:用于 Etcd 的 HTTPS 通信。
  6. Service Account 证书
    • sa.pub、sa.key:用于 Service Account 的签名和验证。

2. 创建证书

HTTPS 证书

  1. 使用 OpenSSL 创建自签名证书
    # 生成私钥
    openssl genrsa -out server.key 2048
    # 生成 CSR(证书签名请求)
    openssl req -new -key server.key -out server.csr
    # 自签名证书
    openssl x509 -req -days 365 -in server.csr -signkey server.key -out server.crt
    
  2. 使用 Let's Encrypt 获取免费证书
    sudo apt-get install certbot
    sudo certbot certonly --standalone -d example.com
    

Kubernetes 集群证书

  1. 使用 Kubeadm 创建证书
    • 初始化集群时自动生成证书:
      kubeadm init --pod-network-cidr=10.244.0.0/16
      
    • 手动生成证书:
      kubeadm init phase certs all
      
  2. 使用 OpenSSL 手动创建证书
    • 生成 CA 证书:
      openssl genrsa -out ca.key 2048
      openssl req -x509 -new -nodes -key ca.key -days 365 -out ca.crt
      
    • 生成 API Server 证书:
      openssl genrsa -out apiserver.key 2048
      openssl req -new -key apiserver.key -out apiserver.csr
      openssl x509 -req -in apiserver.csr -CA ca.crt -CAkey ca.key -CAcreateserial -out apiserver.crt -days 365
      

3. 续期证书

HTTPS 证书

  1. 手动续期
    • 使用 OpenSSL 重新生成证书:
      openssl x509 -req -days 365 -in server.csr -signkey server.key -out server.crt
      
    • 使用 Let's Encrypt 续期:
      sudo certbot renew
      
  2. 自动续期
    • 配置 CronJob 定期续期:
      echo "0 0 1 * * root /usr/bin/certbot renew" | sudo tee /etc/cron.d/certbot-renew
      

Kubernetes 集群证书

  1. 查看证书有效期
    kubeadm certs check-expiration
    
  2. 续期所有证书
    kubeadm certs renew all
    
  3. 续期单个证书
    kubeadm certs renew apiserver
    
  4. 重启组件
    • 重启 API Server、Controller Manager、Scheduler 和 Kubelet:
      sudo systemctl restart kubelet
      
    • 如果使用静态 Pod 部署控制平面组件,重启 Docker 或 Containerd:
      sudo systemctl restart docker
      

4. 自动化管理

HTTPS 证书

  1. 使用 Cert-Manager
    • 在 Kubernetes 中部署 Cert-Manager,自动管理 Let's Encrypt 证书:
      kubectl apply -f https://github.com/cert-manager/cert-manager/releases/download/v1.10.0/cert-manager.yaml
      
    • 创建 Certificate 资源:
      apiVersion: cert-manager.io/v1
      kind: Certificate
      metadata:
        name: example-com
      spec:
        secretName: example-com-tls
        dnsNames:
        - example.com
        issuerRef:
          name: letsencrypt-prod
          kind: ClusterIssuer
      

Kubernetes 集群证书

  1. 配置 Kubeadm 自动续期
    • 修改 Kubeadm 配置文件(如 /etc/kubernetes/kubeadm-config.yaml):
      apiVersion: kubeadm.k8s.io/v1beta3
      kind: ClusterConfiguration
      certificatesDir: /etc/kubernetes/pki
      certificateValidityPeriod: 8760h0m0s  # 1 年(该字段自 kubeadm.k8s.io/v1beta4 起提供,v1beta3 不支持)
      certificateRenewBefore: 720h0m0s     # 30 天(kubeadm 官方配置中未见此字段,使用前请核实)
      
    • 重新生成证书:
      kubeadm init phase certs all --config /etc/kubernetes/kubeadm-config.yaml
      
  2. 使用 CronJob 自动续期
    • 创建一个 CronJob,定期执行 kubeadm certs renew:
      apiVersion: batch/v1
      kind: CronJob
      metadata:
        name: kubeadm-cert-renew
      spec:
        schedule: "0 0 1 * *"  # 每月 1 日执行
        jobTemplate:
          spec:
            template:
              spec:
                containers:
                - name: kubeadm-cert-renew
                  image: k8s.gcr.io/kubeadm:v1.22.0
                  command: ["kubeadm", "certs", "renew", "all"]
                restartPolicy: OnFailure
      

5. 注意事项

  1. 证书续期影响
    • 续期证书后,需要重启相关组件,可能导致短暂的服务中断。
  2. 备份证书
    • 在续期或恢复证书前,务必备份现有证书,以防意外。
  3. 集群一致性
    • 在多 Master 集群中,确保所有 Master 节点的证书一致。
posted @   Mugetsukun  阅读(4)  评论(0)  编辑  收藏  举报
相关博文:
阅读排行:
· DeepSeek 开源周回顾「GitHub 热点速览」
· 物流快递公司核心技术能力-地址解析分单基础技术分享
· .NET 10首个预览版发布:重大改进与新特性概览!
· AI与.NET技术实操系列(二):开始使用ML.NET
· .NET10 - 预览版1新功能体验(一)
点击右上角即可分享
微信分享提示