Ops Script Cheat Sheet
1. Docker cheat sheet
2. Linux cheat sheet
3. nginx cheat sheet
4. ansible cheat sheet
5. PostgreSQL cheat sheet
6. Ops script cheat sheet
7. Redis cheat sheet
8. RabbitMQ cheat sheet
9. ELK (Elasticsearch, Kibana, Filebeat, Metricbeat, Logstash, Elastic Agent, Fleet Server, APM) cheat sheet
10. Keepalived cheat sheet
11. LVS (Linux Virtual Server) cheat sheet
12. bash cheat sheet
13. Text processing cheat sheet (grep, sed, awk, find)
14. Data transfer cheat sheet (rsync, scp)
15. Process management cheat sheet (supervisor)
16. Network services cheat sheet (vsftpd, nfs, samba)
17. Automation scripting cheat sheet
18. Kubernetes architecture, deployment, and application cheat sheet (250213, script updated)
Daily inspection
Shell
#!/bin/bash
# Debian/RHEL/CentOS Check Script
if [ "$(id -u)" -ne 0 ]; then
echo "Please run this script as root"
exit 1
fi
# Get the primary IP address (skip the loopback interface)
IPADDR=$(ifconfig | grep 'inet ' | grep -v '127.0.0.1' | awk '{print $2}' | head -n 1)
# Detect the operating system type
OS_TYPE=$(cat /etc/*release | grep '^ID=' | cut -d'=' -f2 | tr -d '"' | head -n 1)
case "$OS_TYPE" in
debian)
OS_NAME="Debian"
;;
centos|rhel)
OS_NAME="RHEL/CentOS"
;;
*)
echo "不支持的操作系统: $OS_TYPE"
exit 1
;;
esac
# Get the operating system version
if [ -f /etc/os-release ]; then
osVersion=$(grep '^VERSION_ID=' /etc/os-release | cut -d'=' -f2 | tr -d '"')
echo "$OS_NAME version: ${osVersion:-unknown}"
else
echo "/etc/os-release not found; unable to determine the OS version."
fi
# Variables
RESULTFILE="./log/HostDailyCheck-$IPADDR-$(date +%Y%m%d).txt"
BACKUP_DIR="./backup"
EMAIL="example@mail.com" # recipient address for the report e-mail
mkdir -p "$(dirname "$RESULTFILE")"
mkdir -p "$BACKUP_DIR"
function log_section {
echo -e "\n--------------------- \$1 ---------------------" >> "$RESULTFILE"
}
# Check memory, disk, CPU and network status
function getSystemStatus {
log_section "System status"
echo "OS: $(uname -o)" >> "$RESULTFILE"
echo "Kernel: $(uname -r)" >> "$RESULTFILE"
echo "Hostname: $(hostname)" >> "$RESULTFILE"
echo "Current time: $(date +'%F %T')" >> "$RESULTFILE"
}
function getCpuStatus {
log_section "CPU status"
lscpu >> "$RESULTFILE"
}
function getMemStatus {
log_section "Memory status"
free -h >> "$RESULTFILE"
}
function getDiskStatus {
log_section "Disk status"
df -h >> "$RESULTFILE"
}
function getNetworkStatus {
log_section "Network status"
ip addr >> "$RESULTFILE"
}
# Web server checks
function checkNginxConfig {
log_section "Nginx configuration check"
if command -v nginx &> /dev/null; then
nginx -t >> "$RESULTFILE" 2>&1
echo "Nginx configuration check result recorded" >> "$RESULTFILE"
else
echo "Nginx is not installed" >> "$RESULTFILE"
fi
}
function monitorNginxLogs {
log_section "Nginx log check"
ACCESS_LOG="/var/log/nginx/access.log"
ERROR_LOG="/var/log/nginx/error.log"
if [ -f "$ACCESS_LOG" ]; then
echo "Recent access log entries:" >> "$RESULTFILE"
tail -n 10 "$ACCESS_LOG" >> "$RESULTFILE"
else
echo "Nginx access log does not exist" >> "$RESULTFILE"
fi
if [ -f "$ERROR_LOG" ]; then
echo "Recent error log entries:" >> "$RESULTFILE"
tail -n 10 "$ERROR_LOG" >> "$RESULTFILE"
else
echo "Nginx error log does not exist" >> "$RESULTFILE"
fi
}
# Database checks
function getMySQLStatus {
log_section "MySQL status"
if systemctl is-active mysql > /dev/null 2>&1; then
echo "MySQL service status: running" >> "$RESULTFILE"
CONNECTIONS=$(mysql -e "SHOW STATUS LIKE 'Threads_connected';" | awk 'NR==2 {print $2}')
echo "Current connections: $CONNECTIONS" >> "$RESULTFILE"
VERSION=$(mysql -V | awk '{print $5}' | tr -d ',')
echo "MySQL version: $VERSION" >> "$RESULTFILE"
log_section "Recent query activity"
mysql -e "SHOW FULL PROCESSLIST;" >> "$RESULTFILE"
else
echo "MySQL service status: not running" >> "$RESULTFILE"
fi
}
function backupMySQL {
log_section "MySQL backup"
DB_NAME="your_mysql_database"
DB_USER="your_mysql_user"
DB_PASSWORD="your_mysql_password"
TIMESTAMP=$(date +'%Y%m%d_%H%M%S')
if mysqldump -u "$DB_USER" -p"$DB_PASSWORD" "$DB_NAME" > "$BACKUP_DIR/${DB_NAME}_backup_$TIMESTAMP.sql"; then
echo "MySQL backup succeeded: ${DB_NAME}_backup_$TIMESTAMP.sql" >> "$RESULTFILE"
else
echo "MySQL backup failed" >> "$RESULTFILE"
fi
}
function getPostgreSQLStatus {
log_section "PostgreSQL 状态"
if systemctl is-active postgresql >> /dev/null 2>&1; then
echo "PostgreSQL 服务状态: 运行中" >> "$RESULTFILE"
DB_NAME="your_postgres_database"
DB_USER="your_postgres_user"
TIMESTAMP=$(date +'%Y%m%d_%H%M%S')
if command -v psql &> /dev/null; then
echo "psql 可用" >> "$RESULTFILE"
else
echo "psql 未安装或不可用" >> "$RESULTFILE"
fi
if command -v pg_dump &> /dev/null; then
echo "pg_dump 可用" >> "$RESULTFILE"
else
echo "pg_dump 未安装或不可用" >> "$RESULTFILE"
fi
CONNECTIONS=$(psql -U "$DB_USER" -d "$DB_NAME" -c "SELECT COUNT(*) FROM pg_stat_activity;" -t | xargs)
echo "当前连接数: $CONNECTIONS" >> "$RESULTFILE"
echo -e "\n查询性能:" >> "$RESULTFILE"
psql -U "$DB_USER" -d "$DB_NAME" -c "SELECT * FROM pg_stat_activity ORDER BY state_change DESC LIMIT 5;" >> "$RESULTFILE"
# PostgreSQL 备份
backupPostgreSQL "$DB_NAME" "$DB_USER" "$TIMESTAMP"
else
echo "PostgreSQL 服务状态: 未运行" >> "$RESULTFILE"
fi
}
function backupPostgreSQL {
log_section "PostgreSQL 备份"
DB_NAME="\$1"
DB_USER="\$2"
TIMESTAMP="\$3"
if pg_dump -U "$DB_USER" "$DB_NAME" > "$BACKUP_DIR/${DB_NAME}_backup_$TIMESTAMP.sql"; then
echo "PostgreSQL 备份成功: ${DB_NAME}_backup_$TIMESTAMP.sql" >> "$RESULTFILE"
else
echo "PostgreSQL 备份失败" >> "$RESULTFILE"
fi
}
# In-memory database checks
function checkRedisStatus {
log_section "Redis status"
if systemctl is-active redis > /dev/null 2>&1; then
echo "Redis service status: running" >> "$RESULTFILE"
REDIS_INFO=$(redis-cli info)
echo "Redis info:" >> "$RESULTFILE"
echo "$REDIS_INFO" >> "$RESULTFILE"
else
echo "Redis service status: not running" >> "$RESULTFILE"
fi
}
# Message queue checks
function getRabbitMQStatus {
log_section "RabbitMQ status"
if systemctl is-active rabbitmq-server > /dev/null 2>&1; then
echo "RabbitMQ service status: running" >> "$RESULTFILE"
rabbitmqctl list_queues name messages consumers | awk 'NR>1 {print "Queue: "$1", messages: "$2", consumers: "$3}' >> "$RESULTFILE"
else
echo "RabbitMQ service status: not running" >> "$RESULTFILE"
fi
}
# Log service checks
function monitorHAProxyLogs {
log_section "HAProxy log check"
ACCESS_LOG="/var/log/haproxy.log"
if [ -f "$ACCESS_LOG" ]; then
echo "Recent HAProxy log entries:" >> "$RESULTFILE"
tail -n 10 "$ACCESS_LOG" >> "$RESULTFILE"
else
echo "HAProxy log does not exist" >> "$RESULTFILE"
fi
}
# Software checks
function getNTPStatus {
log_section "NTP status"
if command -v ntpd &> /dev/null; then
echo "NTP service status: $(systemctl is-active ntpd)" >> "$RESULTFILE"
elif command -v chronyd &> /dev/null; then
echo "NTP service status: $(systemctl is-active chronyd)" >> "$RESULTFILE"
else
echo "NTP service is not installed or not configured" >> "$RESULTFILE"
fi
}
function getJDKStatus {
log_section "JDK 状态"
if command -v java &> /dev/null; then
java -version >> "$RESULTFILE" 2>&1
else
echo "Java 未安装" >> "$RESULTFILE"
fi
}
function getMavenStatus {
log_section "Maven 状态"
if command -v mvn &> /dev/null; then
mvn -v >> "$RESULTFILE" 2>&1
else
echo "Maven 未安装" >> "$RESULTFILE"
fi
}
function checkPrometheus {
log_section "Prometheus 状态"
if systemctl is-active prometheus >> /dev/null 2>&1; then
echo "Prometheus 服务状态: 运行中" >> "$RESULTFILE"
PROMETHEUS_URL="http://localhost:9090/metrics"
if curl -s -o /dev/null -w "%{http_code}" "$PROMETHEUS_URL" | grep -q "200"; then
echo "Prometheus 指标收集正常" >> "$RESULTFILE"
else
echo "Prometheus 指标收集异常" >> "$RESULTFILE"
fi
else
echo "Prometheus 服务状态: 未运行" >> "$RESULTFILE"
fi
}
function checkGrafana {
log_section "Grafana 状态"
if systemctl is-active grafana-server >> /dev/null 2>&1; then
echo "Grafana 服务状态: 运行中" >> "$RESULTFILE"
GRAFANA_URL="http://localhost:3000"
if curl -s -o /dev/null -w "%{http_code}" "$GRAFANA_URL" | grep -q "200"; then
echo "Grafana 仪表盘可用" >> "$RESULTFILE"
else
echo "Grafana 仪表盘不可用" >> "$RESULTFILE"
fi
else
echo "Grafana 服务状态: 未运行" >> "$RESULTFILE"
fi
}
# CI/CD 监控
function getCIStatus {
log_section "CI/CD 状态"
if systemctl is-active gitlab >> /dev/null 2>&1; then
echo "GitLab 服务状态: 运行中" >> "$RESULTFILE"
else
echo "GitLab 服务状态: 未运行" >> "$RESULTFILE"
fi
if systemctl is-active jenkins >> /dev/null 2>&1; then
echo "Jenkins 服务状态: 运行中" >> "$RESULTFILE"
else
echo "Jenkins 服务状态: 未运行" >> "$RESULTFILE"
fi
}
# Container checks
function dockerInspection {
log_section "Docker status"
if command -v docker &> /dev/null; then
echo "Running Docker containers:" >> "$RESULTFILE"
docker ps >> "$RESULTFILE"
echo -e "\nAll Docker containers:" >> "$RESULTFILE"
docker ps -a >> "$RESULTFILE"
echo -e "\nDocker images:" >> "$RESULTFILE"
docker images >> "$RESULTFILE"
echo -e "\nDocker container resource usage:" >> "$RESULTFILE"
docker stats --no-stream >> "$RESULTFILE"
echo -e "\nDocker container logs:" >> "$RESULTFILE"
CONTAINERS=$(docker ps -q)
for CONTAINER in $CONTAINERS; do
echo -e "\nLogs for container $CONTAINER:" >> "$RESULTFILE"
docker logs "$CONTAINER" --tail 10 >> "$RESULTFILE"
done
echo -e "\nDocker networks:" >> "$RESULTFILE"
docker network ls >> "$RESULTFILE"
else
echo "Docker is not installed" >> "$RESULTFILE"
fi
}
# Container orchestration checks
function checkKubernetesStatus {
log_section "Kubernetes status"
if command -v kubectl &> /dev/null; then
echo "Kubernetes cluster status:" >> "$RESULTFILE"
CLUSTER_STATUS=$(kubectl cluster-info)
echo "$CLUSTER_STATUS" >> "$RESULTFILE"
echo -e "\nNode status:" >> "$RESULTFILE"
kubectl get nodes >> "$RESULTFILE"
echo -e "\nPod status:" >> "$RESULTFILE"
kubectl get pods --all-namespaces >> "$RESULTFILE"
else
echo "kubectl is not installed" >> "$RESULTFILE"
fi
}
# Main logic
{
getSystemStatus
getCpuStatus
getMemStatus
getDiskStatus
getNetworkStatus
checkNginxConfig
monitorNginxLogs
getMySQLStatus
backupMySQL
getPostgreSQLStatus
checkRedisStatus
getRabbitMQStatus
monitorHAProxyLogs
getNTPStatus
getJDKStatus
getMavenStatus
checkPrometheus
checkGrafana
getCIStatus
dockerInspection
checkKubernetesStatus
} > "$RESULTFILE" 2>&1
# echo "检查结果已保存到:$RESULTFILE"
# 发送结果到指定邮箱
mail -s "巡检结果 - $IPADDR" "$EMAIL" < "$RESULTFILE"
echo "检查结果已保存到:$RESULTFILE,且已发送到 $EMAIL"
Go (unchanged)
package main
import (
"bytes"
"fmt"
"net/mail"
"net/smtp"
"os"
"os/exec"
"strings"
"time"
)
const (
emailSender = "your_email@gmail.com" // 发件邮箱
emailPassword = "your_email_password" // 发件邮箱密码
emailReceiver = "example@mail.com" // 收件邮箱
)
func main() {
if os.Geteuid() != 0 {
fmt.Println("Please run this program as root")
return
}
ipAddr, err := getIP()
if err != nil {
fmt.Println("Failed to get the IP address:", err)
return
}
osName, osVersion, err := getOSInfo()
if err != nil {
fmt.Println("Failed to get OS information:", err)
return
}
// Go's reference layout "20060102" formats the current date as YYYYMMDD
resultFile := fmt.Sprintf("./log/HostDailyCheck-%s-%s.txt", ipAddr, time.Now().Format("20060102"))
os.MkdirAll("./log", os.ModePerm)
os.MkdirAll("./backup", os.ModePerm)
var result bytes.Buffer
result.WriteString(fmt.Sprintf("IP address: %s\n", ipAddr))
result.WriteString(fmt.Sprintf("%s version: %s\n", osName, osVersion))
result.WriteString(getSystemStatus())
result.WriteString(getCPUStatus())
result.WriteString(getMemStatus())
result.WriteString(getDiskStatus())
result.WriteString(getNetworkStatus())
result.WriteString(checkNginxConfig())
result.WriteString(monitorNginxLogs())
result.WriteString(getMySQLStatus())
// Backup and other status checks can be added similarly
err = os.WriteFile(resultFile, result.Bytes(), 0644)
if err != nil {
fmt.Println("Failed to write the result file:", err)
return
}
err = sendEmail("Daily inspection report - "+ipAddr, result.String())
if err != nil {
fmt.Println("Failed to send the e-mail:", err)
} else {
fmt.Println("Report sent to:", emailReceiver)
}
}
func getIP() (string, error) {
cmd := exec.Command("ifconfig")
output, err := cmd.Output()
if err != nil {
return "", err
}
lines := strings.Split(string(output), "\n")
for _, line := range lines {
// Skip the loopback interface and return the first non-local address
if strings.Contains(line, "inet ") && !strings.Contains(line, "127.0.0.1") {
parts := strings.Fields(line)
if len(parts) > 1 {
return parts[1], nil
}
}
}
return "", fmt.Errorf("no IP address found")
}
func getOSInfo() (string, string, error) {
cmd := exec.Command("cat", "/etc/os-release")
output, err := cmd.Output()
if err != nil {
return "", "", err
}
var osName, osVersion string
lines := strings.Split(string(output), "\n")
for _, line := range lines {
if strings.HasPrefix(line, "ID=") {
osName = strings.Trim(strings.Split(line, "=")[1], "\"")
}
if strings.HasPrefix(line, "VERSION_ID=") {
osVersion = strings.Trim(strings.Split(line, "=")[1], "\"")
}
}
if osName == "" || osVersion == "" {
return "", "", fmt.Errorf("无法获取操作系统信息")
}
return osName, osVersion, nil
}
func getSystemStatus() string {
var result bytes.Buffer
result.WriteString("系统状态:\n")
result.WriteString(fmt.Sprintf("系统: %s\n", execOutput("uname", "-o")))
result.WriteString(fmt.Sprintf("内核: %s\n", execOutput("uname", "-r")))
result.WriteString(fmt.Sprintf("主机名: %s\n", execOutput("hostname")))
result.WriteString(fmt.Sprintf("当前时间: %s\n", time.Now().Format(time.RFC1123)))
return result.String()
}
func getCPUStatus() string {
return fmt.Sprintf("CPU 状态:\n%s\n", execOutput("lscpu"))
}
func getMemStatus() string {
return fmt.Sprintf("内存状态:\n%s\n", execOutput("free", "-h"))
}
func getDiskStatus() string {
return fmt.Sprintf("磁盘状态:\n%s\n", execOutput("df", "-h"))
}
func getNetworkStatus() string {
return fmt.Sprintf("网络状态:\n%s\n", execOutput("ip", "addr"))
}
func checkNginxConfig() string {
var result bytes.Buffer
result.WriteString("Nginx 配置检查:\n")
if _, err := exec.LookPath("nginx"); err == nil {
result.WriteString(execOutput("nginx", "-t"))
} else {
result.WriteString("Nginx 未安装\n")
}
return result.String()
}
func monitorNginxLogs() string {
var result bytes.Buffer
result.WriteString("Nginx 日志监控:\n")
accessLog := "/var/log/nginx/access.log"
errorLog := "/var/log/nginx/error.log"
if _, err := os.Stat(accessLog); err == nil {
result.WriteString("最近的访问日志:\n")
result.WriteString(execOutput("tail", "-n", "10", accessLog))
} else {
result.WriteString("Nginx 访问日志不存在\n")
}
if _, err := os.Stat(errorLog); err == nil {
result.WriteString("最近的错误日志:\n")
result.WriteString(execOutput("tail", "-n", "10", errorLog))
} else {
result.WriteString("Nginx 错误日志不存在\n")
}
return result.String()
}
func getMySQLStatus() string {
var result bytes.Buffer
result.WriteString("MySQL 状态:\n")
if cmd := exec.Command("systemctl", "is-active", "mysql"); cmd.Run() == nil {
result.WriteString("MySQL 服务状态: 运行中\n")
result.WriteString(execOutput("mysql", "-e", "SHOW STATUS LIKE 'Threads_connected';"))
result.WriteString(execOutput("mysql", "-V"))
result.WriteString("最近的查询性能:\n")
result.WriteString(execOutput("mysql", "-e", "SHOW FULL PROCESSLIST;"))
} else {
result.WriteString("MySQL 服务状态: 未运行\n")
}
return result.String()
}
func execOutput(command string, args ...string) string {
cmd := exec.Command(command, args...)
output, err := cmd.CombinedOutput() // capture stderr as well (nginx -t, for example, writes to stderr)
if err != nil {
return fmt.Sprintf("command failed: %s\n", err)
}
return string(output)
}
func sendEmail(subject, body string) error {
auth := smtp.PlainAuth("", emailSender, emailPassword, "smtp.gmail.com")
to := []string{emailReceiver}
msg := []byte("To: " + emailReceiver + "\r\n" +
"Subject: " + subject + "\r\n" +
"\r\n" +
body)
return smtp.SendMail("smtp.gmail.com:587", auth, emailSender, to, msg)
}
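One possible way to build and schedule the Go version, assuming the source above is saved as daily_check.go (file name, install path and schedule are placeholders):
go build -o /opt/scripts/daily_check daily_check.go
( crontab -l 2>/dev/null; echo "30 6 * * * /opt/scripts/daily_check >> /var/log/daily_check.cron.log 2>&1" ) | crontab -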
Python
import psutil
import subprocess
import logging
import requests
from kubernetes import client, config
import docker
import pymysql
import redis
import pymongo
import json
from concurrent.futures import ThreadPoolExecutor, as_completed
from functools import partial
import os
# Configure logging
logging.basicConfig(filename='microservice_daily_check.log', level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
# DingTalk webhook URL (read from an environment variable)
DINGTALK_WEBHOOK_URL = os.getenv("DINGTALK_WEBHOOK_URL")
def send_dingtalk_alert(webhook_url, message):
"""
通过钉钉Webhook发送告警消息
:param webhook_url: 钉钉机器人的Webhook URL
:param message: 告警消息内容
"""
headers = {"Content-Type": "application/json"}
data = {
"msgtype": "text",
"text": {
"content": message
}
}
try:
response = requests.post(webhook_url, headers=headers, data=json.dumps(data))
if response.status_code == 200:
logging.info("DingTalk alert sent successfully.")
else:
logging.error(f"Failed to send DingTalk alert: {response.text}")
except Exception as e:
logging.error(f"Error sending DingTalk alert: {str(e)}", exc_info=True)
def log_exception(func):
"""
装饰器:捕获并记录异常,并发送钉钉告警
:param func: 被装饰的函数
"""
def wrapper(*args, **kwargs):
try:
return func(*args, **kwargs)
except Exception as e:
error_message = f"Error in {func.__name__}: {str(e)}"
logging.error(error_message, exc_info=True)
if DINGTALK_WEBHOOK_URL:
send_dingtalk_alert(DINGTALK_WEBHOOK_URL, error_message)
return wrapper
@log_exception
def check_system_resources():
"""检查系统资源使用情况(CPU、内存、磁盘、网络)"""
cpu_usage = psutil.cpu_percent(interval=1)
memory_info = psutil.virtual_memory()
disk_usage = psutil.disk_usage('/')
network_io = psutil.net_io_counters()
logging.info(f"CPU Usage: {cpu_usage}%")
logging.info(f"Memory Usage: {memory_info.percent}%")
logging.info(f"Disk Usage: {disk_usage.percent}%")
logging.info(f"Network Sent: {network_io.bytes_sent} bytes, Received: {network_io.bytes_recv} bytes")
@log_exception
def check_docker_containers():
"""检查Docker容器状态"""
docker_client = docker.from_env()
containers = docker_client.containers.list()
for container in containers:
logging.info(f"Container {container.name} status: {container.status}")
@log_exception
def check_kubernetes_pods():
"""检查Kubernetes Pods状态"""
config.load_kube_config() # 加载K8s配置文件
v1 = client.CoreV1Api()
pods = v1.list_pod_for_all_namespaces()
for pod in pods.items:
logging.info(f"Pod {pod.metadata.name} in namespace {pod.metadata.namespace} status: {pod.status.phase}")
@log_exception
def check_microservice_health(url):
"""检查微服务健康状态"""
response = requests.get(url)
if response.status_code == 200:
logging.info(f"Microservice at {url} is healthy.")
else:
logging.error(f"Microservice at {url} returned status code {response.status_code}.")
@log_exception
def check_database_status(db_type, **kwargs):
"""检查数据库状态(支持MySQL、Redis、MongoDB)"""
if db_type == "mysql":
connection = pymysql.connect(**kwargs)
with connection.cursor() as cursor:
cursor.execute("SELECT 1")
result = cursor.fetchone()
if result[0] == 1:
logging.info("MySQL is running.")
else:
logging.error("MySQL is not running.")
elif db_type == "redis":
r = redis.Redis(**kwargs)
if r.ping():
logging.info("Redis is running.")
else:
logging.error("Redis is not running.")
elif db_type == "mongodb":
client = pymongo.MongoClient(**kwargs)
if client.server_info():
logging.info("MongoDB is running.")
else:
logging.error("MongoDB is not running.")
@log_exception
def check_rabbitmq_status(host="localhost", port=15672, user="guest", password="guest"):
"""检查RabbitMQ状态"""
url = f"http://{host}:{port}/api/health"
response = requests.get(url, auth=(user, password))
if response.status_code == 200 and response.json().get("status") == "ok":
logging.info("RabbitMQ is healthy.")
else:
logging.error(f"RabbitMQ is not healthy: {response.text}")
@log_exception
def check_kafka_status(broker="localhost:9092"):
"""检查Kafka状态"""
from kafka import KafkaAdminClient
admin_client = KafkaAdminClient(bootstrap_servers=broker)
topics = admin_client.list_topics()
if topics:
logging.info("Kafka is running.")
else:
logging.error("Kafka has no topics.")
@log_exception
def check_eureka_services(eureka_url="http://localhost:8761/eureka/apps"):
"""检查Eureka注册的服务"""
response = requests.get(eureka_url)
if response.status_code == 200:
apps = response.json().get("applications", {}).get("application", [])
for app in apps:
logging.info(f"Service {app['name']} is registered in Eureka.")
else:
logging.error(f"Failed to fetch Eureka services: {response.text}")
@log_exception
def check_nginx_status():
"""检查Nginx状态"""
result = subprocess.run(['systemctl', 'status', 'nginx'], capture_output=True, text=True)
if "active (running)" in result.stdout:
logging.info("Nginx is running.")
else:
logging.error("Nginx is not running.")
@log_exception
def check_haproxy_status():
"""检查HAProxy状态"""
result = subprocess.run(['systemctl', 'status', 'haproxy'], capture_output=True, text=True)
if "active (running)" in result.stdout:
logging.info("HAProxy is running.")
else:
logging.error("HAProxy is not running.")
@log_exception
def check_elk_status(elasticsearch_url="http://localhost:9200", kibana_url="http://localhost:5601"):
"""检查ELK状态(Elasticsearch和Kibana)"""
es_response = requests.get(elasticsearch_url)
if es_response.status_code == 200:
logging.info("Elasticsearch is running.")
else:
logging.error(f"Elasticsearch is not running: {es_response.text}")
kibana_response = requests.get(kibana_url)
if kibana_response.status_code == 200:
logging.info("Kibana is running.")
else:
logging.error(f"Kibana is not running: {kibana_response.text}")
@log_exception
def check_prometheus_grafana(prometheus_url="http://localhost:9090", grafana_url="http://localhost:3000"):
"""检查Prometheus和Grafana状态"""
prometheus_response = requests.get(prometheus_url)
if prometheus_response.status_code == 200:
logging.info("Prometheus is running.")
else:
logging.error(f"Prometheus is not running: {prometheus_response.text}")
grafana_response = requests.get(grafana_url)
if grafana_response.status_code == 200:
logging.info("Grafana is running.")
else:
logging.error(f"Grafana is not running: {grafana_response.text}")
@log_exception
def check_jenkins_status(jenkins_url="http://localhost:8080"):
"""检查Jenkins状态"""
response = requests.get(jenkins_url)
if response.status_code == 200:
logging.info("Jenkins is running.")
else:
logging.error(f"Jenkins is not running: {response.text}")
@log_exception
def check_gitlab_ci_status(gitlab_url="http://localhost", project_id=1, token="your_token"):
"""检查GitLab CI状态"""
url = f"{gitlab_url}/api/v4/projects/{project_id}/pipelines"
headers = {"PRIVATE-TOKEN": token}
response = requests.get(url, headers=headers)
if response.status_code == 200:
pipelines = response.json()
if pipelines:
logging.info(f"GitLab CI pipelines found: {len(pipelines)}")
else:
logging.error("No GitLab CI pipelines found.")
else:
logging.error(f"Failed to fetch GitLab CI pipelines: {response.text}")
def run_checks_concurrently(checks):
"""
并发执行检查任务
:param checks: 需要执行的检查任务列表
"""
with ThreadPoolExecutor(max_workers=10) as executor:
futures = [executor.submit(check) for check in checks]
for future in as_completed(futures):
try:
future.result()
except Exception as e:
logging.error(f"Error in concurrent task: {str(e)}", exc_info=True)
def main():
"""主函数:启动微服务日常巡检"""
logging.info("Starting microservice daily check...")
# 定义所有检查任务
checks = [
check_system_resources,
check_docker_containers,
check_kubernetes_pods,
partial(check_microservice_health, "http://example.com/health"),
partial(check_database_status, "mysql", host="localhost", user="root", password="password", database="test"),
partial(check_database_status, "redis", host="localhost", port=6379),
partial(check_database_status, "mongodb", host="localhost", port=27017),
check_rabbitmq_status,
check_kafka_status,
check_eureka_services,
check_nginx_status,
check_haproxy_status,
check_elk_status,
check_prometheus_grafana,
check_jenkins_status,
check_gitlab_ci_status,
]
    # Run the checks concurrently
run_checks_concurrently(checks)
logging.info("Microservice daily check completed.")
if __name__ == "__main__":
main()
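A rough way to install the third-party dependencies and run the script; the file name microservice_daily_check.py and the webhook value are assumptions based on the imports and environment variable used above:
pip install psutil requests kubernetes docker pymysql redis pymongo kafka-python
export DINGTALK_WEBHOOK_URL="https://oapi.dingtalk.com/robot/send?access_token=<your_token>"
python3 microservice_daily_check.py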
Script structure notes
- Logging: the logging module writes to the log file microservice_daily_check.log.
- DingTalk alerts: the send_dingtalk_alert function posts alert messages and is wired into the log_exception decorator; a curl sketch for testing the webhook follows this list.
- Check functions: every check is wrapped with the @log_exception decorator so exceptions are logged and alerted; the checks cover system resources, Docker containers, Kubernetes Pods, microservice health, databases, message queues, service discovery, load balancing, logging and monitoring, and CI/CD tools.
- Concurrency: checks run in parallel via ThreadPoolExecutor to speed up the inspection.
- Main function: defines all check tasks and passes them to run_checks_concurrently.
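To verify the webhook outside the script, a hedged curl test can be used; the access_token value is a placeholder, and if the bot uses keyword filtering the message must contain one of the configured keywords:
curl -s -H "Content-Type: application/json" \
  -d '{"msgtype": "text", "text": {"content": "daily check: webhook test"}}' \
  "https://oapi.dingtalk.com/robot/send?access_token=<your_token>"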
HTTPS certificates and Kubernetes certificates
1. Certificate types
HTTPS certificates
- Server certificates:
  - Used by HTTPS servers such as Nginx and Apache.
  - Contain a public/private key pair and are usually issued by a CA.
- Client certificates:
  - Used for client authentication, e.g. mutual TLS (mTLS); see the openssl/curl sketch after this list.
  - Contain a public/private key pair and are usually issued by a CA.
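A couple of hedged commands for inspecting a server certificate and presenting a client certificate; example.com and the client.crt/client.key paths are placeholders:
# Show subject, issuer and expiry of the certificate a server presents
echo | openssl s_client -connect example.com:443 -servername example.com 2>/dev/null | openssl x509 -noout -subject -issuer -enddate
# Call an mTLS-protected endpoint with a client certificate
curl --cert client.crt --key client.key https://example.com/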
Kubernetes cluster certificates (an expiry-check sketch follows this list)
- CA certificate: ca.crt and ca.key, used to sign the other certificates.
- API server certificate: apiserver.crt and apiserver.key, used for the API server's HTTPS traffic.
- Kubelet client certificate: apiserver-kubelet-client.crt and apiserver-kubelet-client.key, used for communication between the API server and the kubelets.
- Front proxy certificate: front-proxy-ca.crt and front-proxy-ca.key, used for the front proxy's HTTPS traffic.
- Etcd certificates: etcd/ca.crt and etcd/ca.key, used for etcd's HTTPS traffic.
- Service account keys: sa.pub and sa.key, used to sign and verify service account tokens.
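A small sketch, assuming a kubeadm layout under /etc/kubernetes/pki, that prints the expiry date of every certificate file (kubeadm certs check-expiration, shown later, reports the same information on newer clusters):
for crt in $(find /etc/kubernetes/pki -name '*.crt'); do
  printf '%-60s ' "$crt"
  openssl x509 -noout -enddate -in "$crt"
done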
2. Creating certificates
HTTPS certificates
- Create a self-signed certificate with OpenSSL:
  # Generate a private key
  openssl genrsa -out server.key 2048
  # Generate a CSR (certificate signing request)
  openssl req -new -key server.key -out server.csr
  # Self-sign the certificate
  openssl x509 -req -days 365 -in server.csr -signkey server.key -out server.crt
- Obtain a free certificate from Let's Encrypt:
  sudo apt-get install certbot
  sudo certbot certonly --standalone -d example.com
Kubernetes cluster certificates
- Create certificates with kubeadm:
  - Certificates are generated automatically when the cluster is initialized:
    kubeadm init --pod-network-cidr=10.244.0.0/16
  - Or generate them manually:
    kubeadm init phase certs all
- Create certificates manually with OpenSSL:
  - Generate the CA certificate:
    openssl genrsa -out ca.key 2048
    openssl req -x509 -new -nodes -key ca.key -days 365 -out ca.crt
  - Generate the API server certificate (a SAN-aware variant is sketched after this list):
    openssl genrsa -out apiserver.key 2048
    openssl req -new -key apiserver.key -out apiserver.csr
    openssl x509 -req -in apiserver.csr -CA ca.crt -CAkey ca.key -CAcreateserial -out apiserver.crt -days 365
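The plain CSR above yields a certificate without subject alternative names, which API server clients reject; a hedged variant that adds SANs, with the service IP 10.96.0.1 and node IP 192.168.1.10 as placeholders:
cat > apiserver-san.cnf <<'EOF'
[ v3_ext ]
subjectAltName = DNS:kubernetes,DNS:kubernetes.default,DNS:kubernetes.default.svc,DNS:kubernetes.default.svc.cluster.local,IP:10.96.0.1,IP:192.168.1.10
EOF
openssl x509 -req -in apiserver.csr -CA ca.crt -CAkey ca.key -CAcreateserial \
  -out apiserver.crt -days 365 -extensions v3_ext -extfile apiserver-san.cnf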
3. Renewing certificates
HTTPS certificates
- Manual renewal:
  - Re-issue the certificate with OpenSSL:
    openssl x509 -req -days 365 -in server.csr -signkey server.key -out server.crt
  - Renew with Let's Encrypt:
    sudo certbot renew
- Automatic renewal:
  - Schedule the renewal with cron (files under /etc/cron.d need a user field):
    echo "0 0 1 * * root /usr/bin/certbot renew" | sudo tee /etc/cron.d/certbot-renew
Kubernetes cluster certificates
- Check certificate expiry:
  kubeadm certs check-expiration
- Renew all certificates:
  kubeadm certs renew all
- Renew a single certificate:
  kubeadm certs renew apiserver
- Restart the affected components:
  - Restart the API server, controller manager, scheduler and kubelet:
    sudo systemctl restart kubelet
  - If the control-plane components run as static Pods, restart Docker or containerd:
    sudo systemctl restart docker
4. Automated management
HTTPS certificates
- Use cert-manager:
  - Deploy cert-manager in Kubernetes to manage Let's Encrypt certificates automatically:
    kubectl apply -f https://github.com/cert-manager/cert-manager/releases/download/v1.10.0/cert-manager.yaml
  - Create a Certificate resource (the referenced ClusterIssuer is sketched after this section):
    apiVersion: cert-manager.io/v1
    kind: Certificate
    metadata:
      name: example-com
    spec:
      secretName: example-com-tls
      dnsNames:
      - example.com
      issuerRef:
        name: letsencrypt-prod
        kind: ClusterIssuer
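The Certificate above references a ClusterIssuer named letsencrypt-prod that is not defined in the original; a possible minimal ACME issuer, with the e-mail address and ingress class as placeholders:
kubectl apply -f - <<'EOF'
apiVersion: cert-manager.io/v1
kind: ClusterIssuer
metadata:
  name: letsencrypt-prod
spec:
  acme:
    server: https://acme-v02.api.letsencrypt.org/directory
    email: admin@example.com
    privateKeySecretRef:
      name: letsencrypt-prod-account-key
    solvers:
    - http01:
        ingress:
          class: nginx
EOF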
Kubernetes cluster certificates
- Configure kubeadm renewal settings:
  - Edit the kubeadm configuration file (e.g. /etc/kubernetes/kubeadm-config.yaml):
    apiVersion: kubeadm.k8s.io/v1beta3
    kind: ClusterConfiguration
    certificatesDir: /etc/kubernetes/pki
    certificateValidityPeriod: 8760h0m0s # 1 year
    certificateRenewBefore: 720h0m0s # 30 days
  - Regenerate the certificates:
    kubeadm init phase certs all --config /etc/kubernetes/kubeadm-config.yaml
- Renew automatically with a CronJob:
  - Create a CronJob that runs kubeadm certs renew periodically:
    apiVersion: batch/v1
    kind: CronJob
    metadata:
      name: kubeadm-cert-renew
    spec:
      schedule: "0 0 1 * *" # run on the 1st of every month
      jobTemplate:
        spec:
          template:
            spec:
              containers:
              - name: kubeadm-cert-renew
                image: k8s.gcr.io/kubeadm:v1.22.0
                command: ["kubeadm", "certs", "renew", "all"]
              restartPolicy: OnFailure
5. Notes
- Impact of renewal: after certificates are renewed the affected components must be restarted, which may cause a brief service interruption.
- Back up certificates: always back up the existing certificates before renewing or restoring them (a backup sketch follows this list).
- Cluster consistency: in a multi-master cluster, make sure the certificates on all master nodes stay consistent.
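A minimal backup sketch for a kubeadm control-plane node before renewal; the backup location is a placeholder:
BACKUP_DIR="/root/k8s-cert-backup-$(date +%Y%m%d)"
mkdir -p "$BACKUP_DIR"
cp -a /etc/kubernetes/pki "$BACKUP_DIR/"
cp -a /etc/kubernetes/*.conf "$BACKUP_DIR/" 2>/dev/null
kubeadm certs check-expiration > "$BACKUP_DIR/expiration-before-renew.txt"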