#!/bin/bash
# Health check script
# Usage: ./health-check.sh
# Checks the application, the database, disk space and backup freshness, then
# prints a JSON report.

set -u

# NOTE(review): show_help() and the head of the argument parser arrived damaged
# in the reviewed copy of this file (text appears to have been stripped between
# "cat <" and "&2"), so the fragment below cannot be parsed by bash. It is kept
# verbatim so nothing is lost; restore the full text from version control.
# Until then CHECK_APP / CHECK_DB / CHECK_DISK / CHECK_BACKUP are presumably
# left without their defaults - confirm against the original.
#
#   show_help() { cat <&2 exit 1 fi APP_URL="$2" shift 2 ;;
#   --no-app) CHECK_APP=0; shift ;;
#   --no-db) CHECK_DB=0; shift ;;
#   --no-disk) CHECK_DISK=0; shift ;;
#   --no-backup) CHECK_BACKUP=0; shift ;;
#   --disk-threshold) if [ $# -lt 2 ]; then echo "ERROR: --disk-threshold requires an argument" >&2 exit 1 fi DISK_THRESHOLD="$2" shift 2 ;;
#   --backup-max-age) if [ $# -lt 2 ]; then echo "ERROR: --backup-max-age requires an argument" >&2 exit 1 fi BACKUP_MAX_AGE="$2" shift 2 ;;
#   *) echo "ERROR: Unknown argument: $1" >&2 exit 1 ;; esac done

# Configuration. A value that is already set wins, then the HEALTH_CHECK_*
# environment variable (where one exists), then the built-in default.
APP_URL="${APP_URL:-${HEALTH_CHECK_URL:-http://localhost:8015}}"
BACKUP_DIR="${BACKUP_DIR:-./backups}"
DISK_THRESHOLD="${DISK_THRESHOLD:-${HEALTH_CHECK_DISK_THRESHOLD:-90}}"
BACKUP_MAX_AGE="${BACKUP_MAX_AGE:-${HEALTH_CHECK_BACKUP_MAX_AGE:-24}}"
DATABASE_URL="${DATABASE_URL:-}"

#######################################
# Escape a string for use inside a JSON string literal.
# Arguments: $1 - raw text (may be empty)
# Outputs:   the escaped text followed by a newline on stdout
#######################################
json_escape() {
  local s=${1-}
  local i code ch

  # Backslash must go first so the escapes added below are not re-escaped.
  s=${s//\\/\\\\}
  s=${s//\"/\\\"}
  s=${s//$'\t'/\\t}
  s=${s//$'\n'/\\n}
  s=${s//$'\r'/\\r}

  # JSON forbids raw control characters; emit the rest as \u00XX. The scan is
  # skipped for the common case of plain text.
  if [[ "$s" == *[[:cntrl:]]* ]]; then
    for ((i = 1; i < 32; i++)); do
      printf -v code '%02x' "$i"
      printf -v ch "\\x$code"
      s=${s//"$ch"/\\u00$code}
    done
  fi

  # printf rather than echo: echo would treat inputs such as "-n" as options.
  printf '%s\n' "$s"
}

# Accumulated check results and counters, updated by add_result.
RESULTS=""
OVERALL_STATUS="healthy"
CHECKS_PASSED=0
CHECKS_FAILED=0
CHECKS_WARNED=0

#######################################
# Record the outcome of one check.
# Globals:   RESULTS, OVERALL_STATUS, CHECKS_PASSED, CHECKS_FAILED,
#            CHECKS_WARNED (all written)
# Arguments: $1 - check name
#            $2 - status: pass | fail | warn (other values are recorded in
#                 RESULTS but change no counter)
#            $3 - human-readable message
#            $4 - optional detail; omitted from the entry when empty
# Returns:   0
#######################################
add_result() {
  local name="$1"
  local status="$2"
  local message="$3"
  local detail="${4:-}"
  local entry

  entry=" {\"name\": \"$name\", \"status\": \"$status\", \"message\": \"$(json_escape "$message")\""
  if [ -n "$detail" ]; then
    entry="$entry, \"detail\": \"$(json_escape "$detail")\""
  fi
  entry="$entry }"

  if [ -z "$RESULTS" ]; then
    RESULTS="$entry"
  else
    RESULTS="$RESULTS, $entry"
  fi

  case "$status" in
    pass)
      CHECKS_PASSED=$((CHECKS_PASSED + 1))
      ;;
    fail)
      CHECKS_FAILED=$((CHECKS_FAILED + 1))
      OVERALL_STATUS="unhealthy"
      ;;
    warn)
      CHECKS_WARNED=$((CHECKS_WARNED + 1))
      # A warning only downgrades a healthy report; it never hides a failure.
      # Written as `if` so the function does not return non-zero here.
      if [ "$OVERALL_STATUS" = "healthy" ]; then
        OVERALL_STATUS="degraded"
      fi
      ;;
  esac
  return 0
}

# 1.
# Application health check, database connectivity check and disk space check.
# Each check runs only when its CHECK_* switch equals 1 and records exactly one
# entry through add_result.

#######################################
# Fetch a URL and report the HTTP status code.
# Arguments: $1 - URL to request (10 second timeout)
# Outputs:   three-digit status code on stdout; 000 when no HTTP response
#            was received
#######################################
probe_http_code() {
  local code
  # curl prints %{http_code} even when the request fails, so combining -f with
  # `|| echo 000` would glue two codes together ("404000", "000000").
  code=$(curl -s -o /dev/null -w '%{http_code}' -m 10 "$1" 2>/dev/null) || true
  case "$code" in
    [0-9][0-9][0-9]) printf '%s\n' "$code" ;;
    *) printf '000\n' ;;
  esac
}

#######################################
# Split a scheme://user:pass@host:port/name?query URL into its parts.
# Globals:   DB_USER, DB_PASS, DB_HOST, DB_PORT, DB_NAME (written)
# Arguments: $1 - database URL
# Notes:     a missing port defaults to 3306; missing credentials or database
#            name yield empty strings. Percent-encoding is not decoded.
#######################################
parse_database_url() {
  local rest=${1#*://}
  local authority userinfo hostport

  rest=${rest%%\?*} # drop the query string
  authority=${rest%%/*}
  DB_NAME=""
  if [[ "$rest" == */* ]]; then
    DB_NAME=${rest#*/}
  fi

  userinfo=""
  hostport=$authority
  if [[ "$authority" == *@* ]]; then
    # Split on the last '@' so an '@' inside the password stays with it.
    userinfo=${authority%@*}
    hostport=${authority##*@}
  fi

  DB_USER=${userinfo%%:*}
  DB_PASS=""
  if [[ "$userinfo" == *:* ]]; then
    DB_PASS=${userinfo#*:}
  fi

  if [[ "$hostport" =~ ^(.*):([0-9]+)$ ]]; then
    DB_HOST=${BASH_REMATCH[1]}
    DB_PORT=${BASH_REMATCH[2]}
  else
    DB_HOST=$hostport
    DB_PORT=3306
  fi
  # Bracketed IPv6 literals: the client wants the bare address.
  DB_HOST=${DB_HOST#\[}
  DB_HOST=${DB_HOST%\]}
}

#######################################
# Classify a disk usage percentage against a threshold.
# Arguments: $1 - usage in percent (integer, no % sign)
#            $2 - failure threshold in percent (integer)
# Outputs:   fail (usage >= threshold), warn (within 10 points below the
#            threshold) or pass on stdout
# Returns:   0 on success, 2 when either argument is not an integer
#######################################
classify_disk_usage() {
  local used=$1 limit=$2
  [[ "$used" =~ ^[0-9]+$ && "$limit" =~ ^[0-9]+$ ]] || return 2
  # 10# forces base 10 so values such as "08" are not parsed as octal.
  if ((10#$used >= 10#$limit)); then
    printf 'fail\n'
  elif ((10#$used >= 10#$limit - 10)); then
    printf 'warn\n'
  else
    printf 'pass\n'
  fi
}

# Application health check
if [ "$CHECK_APP" -eq 1 ]; then
  if command -v curl >/dev/null 2>&1; then
    HTTP_CODE=$(probe_http_code "$APP_URL")
    case "$HTTP_CODE" in
      200 | 302)
        add_result "app" "pass" "Application is healthy" "HTTP $HTTP_CODE from $APP_URL"
        ;;
      000)
        add_result "app" "fail" "Application is not reachable" "Cannot connect to $APP_URL"
        ;;
      *)
        add_result "app" "fail" "Application returned error" "HTTP $HTTP_CODE from $APP_URL"
        ;;
    esac
  else
    add_result "app" "warn" "curl not available, skipping app check" ""
  fi
fi

# 2. Database connectivity check
if [ "$CHECK_DB" -eq 1 ]; then
  if [ -z "$DATABASE_URL" ]; then
    add_result "database" "warn" "DATABASE_URL not set, skipping DB check" ""
  elif ! command -v mysql >/dev/null 2>&1; then
    add_result "database" "warn" "mysql client not available, skipping DB check" ""
  else
    parse_database_url "$DATABASE_URL"
    MYSQL_ARGS=(-h "$DB_HOST" -P "$DB_PORT")
    if [ -n "$DB_USER" ]; then
      MYSQL_ARGS+=(-u "$DB_USER")
    fi
    # One query serves as both connectivity probe and version lookup; its
    # output is captured so it cannot leak into the JSON report on stdout.
    # The password travels via MYSQL_PWD instead of -p"...": that keeps it out
    # of the process list and avoids the interactive prompt that a bare -p
    # (empty password) would trigger.
    # NOTE(review): MYSQL_PWD is marked deprecated in the MySQL 8.0 manual;
    # move to an option file if the deployed client drops it.
    if DB_VERSION=$(MYSQL_PWD="$DB_PASS" mysql "${MYSQL_ARGS[@]}" \
      -s -N -e "SELECT VERSION();" 2>/dev/null); then
      DB_VERSION=${DB_VERSION:-unknown}
      add_result "database" "pass" "Database connection successful" "Host: $DB_HOST:$DB_PORT, DB: $DB_NAME, Version: $DB_VERSION"
    else
      add_result "database" "fail" "Database connection failed" "Cannot connect to $DB_HOST:$DB_PORT/$DB_NAME"
    fi
  fi
fi

# 3. Disk space check
if [ "$CHECK_DISK" -eq 1 ]; then
  # Usage of the filesystem holding the current directory. -P keeps each
  # filesystem on a single line with a fixed column layout.
  DISK_INFO=$(df -Ph . 2>/dev/null | tail -n 1)
  if [ -n "$DISK_INFO" ]; then
    # Columns: filesystem, size, used, available, use%, mount point (the last
    # variable receives the remainder of the line).
    read -r _ DISK_TOTAL_HUMAN DISK_USE_HUMAN DISK_AVAIL_HUMAN DISK_USE_PCT DISK_MOUNT <<<"$DISK_INFO"
    DISK_USE_PCT=${DISK_USE_PCT%\%}

    if ! DISK_STATE=$(classify_disk_usage "$DISK_USE_PCT" "$DISK_THRESHOLD"); then
      # Previously a non-numeric value made both comparisons error out and
      # the check fell through to "Disk space OK".
      add_result "disk" "warn" "Could not determine disk usage" "Usage: '${DISK_USE_PCT}', threshold: '${DISK_THRESHOLD}' (both must be integers)"
    else
      case "$DISK_STATE" in
        fail)
          add_result "disk" "fail" "Disk space critical" "Usage: ${DISK_USE_PCT}% (threshold: ${DISK_THRESHOLD}%), Used: ${DISK_USE_HUMAN}/${DISK_TOTAL_HUMAN}, Available: ${DISK_AVAIL_HUMAN}, Mount: ${DISK_MOUNT}"
          ;;
        warn)
          add_result "disk" "warn" "Disk space warning" "Usage: ${DISK_USE_PCT}% (threshold: ${DISK_THRESHOLD}%), Used: ${DISK_USE_HUMAN}/${DISK_TOTAL_HUMAN}, Available: ${DISK_AVAIL_HUMAN}, Mount: ${DISK_MOUNT}"
          ;;
        *)
          add_result "disk" "pass" "Disk space OK" "Usage: ${DISK_USE_PCT}%, Used: ${DISK_USE_HUMAN}/${DISK_TOTAL_HUMAN}, Available: ${DISK_AVAIL_HUMAN}, Mount: ${DISK_MOUNT}"
          ;;
      esac
    fi
  else
    add_result "disk" "warn" "Could not determine disk usage" ""
  fi
fi

# 4.
# Backup freshness check: finds the newest db_backup_*.sql.gz in BACKUP_DIR and
# compares its age in hours with BACKUP_MAX_AGE.

#######################################
# Print the newest db_backup_*.sql.gz file in a directory.
# Arguments: $1 - directory to search
# Outputs:   path of the most recently modified match on stdout
# Returns:   0 when a file was found, 1 otherwise
#######################################
latest_backup_file() {
  local dir=$1 candidate newest=""
  # Glob instead of parsing `ls -t`, so unusual file names cannot break it.
  for candidate in "$dir"/db_backup_*.sql.gz; do
    [ -e "$candidate" ] || continue # unmatched glob stays literal
    if [ -z "$newest" ] || [ "$candidate" -nt "$newest" ]; then
      newest=$candidate
    fi
  done
  if [ -z "$newest" ]; then
    return 1
  fi
  printf '%s\n' "$newest"
}

#######################################
# Run stat with a GNU format, falling back to the BSD form.
# Arguments: $1 - format for `stat -c`
#            $2 - equivalent format for `stat -f`
#            $3 - file
# Outputs:   stat output on stdout
# Returns:   non-zero when both invocations fail
#######################################
file_stat() {
  stat -c "$1" "$3" 2>/dev/null || stat -f "$2" "$3" 2>/dev/null
}

#######################################
# Format a byte count with a B/KB/MB/GB/TB suffix and one decimal.
# Arguments: $1 - size in bytes
#######################################
human_size() {
  printf '%s\n' "$1" |
    awk '{split("B KB MB GB TB",v);i=1;while($1>=1024&&i<5){$1/=1024;i++};printf "%.1f%s",$1,v[i]}'
}

#######################################
# Classify a backup age against the maximum allowed age.
# Arguments: $1 - age in hours (integer)
#            $2 - maximum age in hours (non-negative integer)
# Outputs:   fail (age > max), warn (age > max / 2) or pass on stdout
# Returns:   0 on success, 2 when either argument is not an integer
#######################################
classify_backup_age() {
  local age=$1 max=$2
  [[ "$age" =~ ^-?[0-9]+$ && "$max" =~ ^[0-9]+$ ]] || return 2
  # 10# keeps a value such as "08" from being parsed as octal.
  if ((age > 10#$max)); then
    printf 'fail\n'
  elif ((age > 10#$max / 2)); then
    printf 'warn\n'
  else
    printf 'pass\n'
  fi
}

if [ "$CHECK_BACKUP" -eq 1 ]; then
  if [ ! -d "$BACKUP_DIR" ]; then
    add_result "backup" "warn" "Backup directory does not exist" "$BACKUP_DIR"
  elif ! LATEST_BACKUP=$(latest_backup_file "$BACKUP_DIR"); then
    add_result "backup" "fail" "No backup files found" "No db_backup_*.sql.gz files in $BACKUP_DIR"
  else
    # Modification time of the backup file, in seconds since the epoch.
    BACKUP_MTIME=$(file_stat %Y %m "$LATEST_BACKUP") || BACKUP_MTIME=""
    if ! [[ "$BACKUP_MTIME" =~ ^[0-9]+$ ]]; then
      # Without this guard the age arithmetic failed and the check fell
      # through to "Backup is fresh".
      add_result "backup" "warn" "Could not determine backup age" "File: ${LATEST_BACKUP##*/}"
    else
      CURRENT_TIME=$(date +%s)
      BACKUP_AGE_HOURS=$(((CURRENT_TIME - BACKUP_MTIME) / 3600))
      BACKUP_SIZE=$(file_stat %s %z "$LATEST_BACKUP") || BACKUP_SIZE=0
      BACKUP_SIZE_HUMAN=$(human_size "$BACKUP_SIZE")

      if ! BACKUP_STATE=$(classify_backup_age "$BACKUP_AGE_HOURS" "$BACKUP_MAX_AGE"); then
        add_result "backup" "warn" "Could not determine backup age" "Maximum age '${BACKUP_MAX_AGE}' is not an integer number of hours"
      else
        case "$BACKUP_STATE" in
          fail)
            add_result "backup" "fail" "Backup is stale" "Latest backup is ${BACKUP_AGE_HOURS}h old (max: ${BACKUP_MAX_AGE}h), File: ${LATEST_BACKUP##*/}, Size: $BACKUP_SIZE_HUMAN"
            ;;
          warn)
            add_result "backup" "warn" "Backup getting old" "Latest backup is ${BACKUP_AGE_HOURS}h old (max: ${BACKUP_MAX_AGE}h), File: ${LATEST_BACKUP##*/}, Size: $BACKUP_SIZE_HUMAN"
            ;;
          *)
            add_result "backup" "pass" "Backup is fresh" "Latest backup is ${BACKUP_AGE_HOURS}h old, File: ${LATEST_BACKUP##*/}, Size: $BACKUP_SIZE_HUMAN"
            ;;
        esac
      fi
    fi
  fi
fi

# Emit the JSON report
TIMESTAMP=$(date -u +"%Y-%m-%dT%H:%M:%SZ")
# NOTE(review): the statement that prints the report is truncated in the
# reviewed copy of this file; only the fragment below survives. It is kept
# verbatim as a comment because it is not parseable on its own. Restore the
# full statement from version control.
#   cat <