#!/bin/bash
# Database failover script.
#
# Usage: ./failover.sh [--auto] [--primary URL] [--standby URL]
#                      [--app-url URL] [--no-restart] [--dry-run]
#
# Switches the application from the primary database to the standby:
#   1. probe the primary, 2. probe the standby, 3. promote the standby,
#   4. rewrite DATABASE_URL in the config file and restart the app,
#   5. verify the application answers on its health URL.
#
# Environment (each is overridden by the matching command-line flag):
#   DATABASE_URL           primary database URL
#   DATABASE_URL_STANDBY   standby database URL
#   FAILOVER_APP_URL       application health URL (default http://localhost:8015)
#   FAILOVER_APP_NAME      Docker container name  (default nextjs-app)
#   FAILOVER_CONFIG_FILE   env file to rewrite    (default .env.local)
#   FAILOVER_LOG_FILE      log file               (default docs/dr/logs/failover.log)
#
# NOTE(review): the copy of this script that was reviewed had three spans
# stripped by the extraction (help text + start of argument parsing, the
# mysql probe in check_db_health, the replication-status query in
# promote_standby). Those spans are reconstructed below from the surrounding
# code and are each marked NOTE(review); confirm them against the original.

set -u

#######################################
# Print usage information to stdout.
# NOTE(review): help text reconstructed; wording of the original is unknown.
#######################################
show_help() {
  cat <<EOF
Usage: $0 [OPTIONS]

Fail over from the primary database to the standby database.

Options:
  --auto            Proceed even if the primary is healthy (asks for confirmation)
  --primary URL     Primary database URL (default: \$DATABASE_URL)
  --standby URL     Standby database URL (default: \$DATABASE_URL_STANDBY)
  --app-url URL     Application health-check URL (default: \$FAILOVER_APP_URL)
  --no-restart      Do not restart the application container
  --dry-run         Log what would be done without changing anything
  -h, --help        Show this help and exit
EOF
}

#######################################
# Abort with the standard "requires an argument" error unless the option
# currently being parsed is followed by a value.
# Arguments: $1 - option name, $2 - number of remaining arguments ($#)
#######################################
require_value() {
  if [[ "$2" -lt 2 ]]; then
    echo "ERROR: $1 requires an argument" >&2
    exit 1
  fi
}

# --- Argument parsing -------------------------------------------------------
# NOTE(review): the flag defaults and the -h/--help and --auto arms are
# reconstructed; the remaining arms are as in the original.
# Flags must be initialised because the script runs under `set -u`.
AUTO_MODE=0
DRY_RUN=0
NO_RESTART=0
PRIMARY_URL="${PRIMARY_URL:-}"
STANDBY_URL="${STANDBY_URL:-}"
APP_URL="${APP_URL:-}"

while [[ $# -gt 0 ]]; do
  case "$1" in
    -h|--help)
      show_help
      exit 0
      ;;
    --auto)
      AUTO_MODE=1
      shift
      ;;
    --primary)
      require_value "$1" "$#"
      PRIMARY_URL="$2"
      shift 2
      ;;
    --standby)
      require_value "$1" "$#"
      STANDBY_URL="$2"
      shift 2
      ;;
    --app-url)
      require_value "$1" "$#"
      APP_URL="$2"
      shift 2
      ;;
    --no-restart)
      NO_RESTART=1
      shift
      ;;
    --dry-run)
      DRY_RUN=1
      shift
      ;;
    *)
      echo "ERROR: Unknown argument: $1" >&2
      exit 1
      ;;
  esac
done

# --- Configuration ----------------------------------------------------------
PRIMARY_URL="${PRIMARY_URL:-${DATABASE_URL:-}}"
STANDBY_URL="${STANDBY_URL:-${DATABASE_URL_STANDBY:-}}"
APP_URL="${APP_URL:-${FAILOVER_APP_URL:-http://localhost:8015}}"
APP_NAME="${FAILOVER_APP_NAME:-nextjs-app}"
CONFIG_FILE="${FAILOVER_CONFIG_FILE:-.env.local}"
LOG_DIR="docs/dr/logs"
LOG_FILE="${FAILOVER_LOG_FILE:-$LOG_DIR/failover.log}"

# Required parameters.
if [[ -z "$STANDBY_URL" ]]; then
  echo "ERROR: Standby database URL not provided" >&2
  echo "Set DATABASE_URL_STANDBY or use --standby" >&2
  exit 1
fi

if [[ -z "$PRIMARY_URL" ]]; then
  echo "ERROR: Primary database URL not provided" >&2
  echo "Set DATABASE_URL or use --primary" >&2
  exit 1
fi

# Create the directory that actually holds the log file. FAILOVER_LOG_FILE may
# point outside LOG_DIR, in which case creating only LOG_DIR would leave tee
# unable to open the log.
if ! mkdir -p -- "$(dirname -- "$LOG_FILE")"; then
  echo "ERROR: Cannot create log directory for $LOG_FILE" >&2
  exit 1
fi

#######################################
# Write a UTC-timestamped line to stdout and append it to LOG_FILE.
# Arguments: $1 - message
#######################################
log() {
  local timestamp
  timestamp=$(date -u +"%Y-%m-%dT%H:%M:%SZ")
  printf '[%s] %s\n' "$timestamp" "$1" | tee -a "$LOG_FILE"
}

#######################################
# Like log(), but prefixed with "ERROR: " and echoed on stderr.
# Arguments: $1 - message
#######################################
log_error() {
  log "ERROR: $1" >&2
}

#######################################
# Return a copy of a URL with the password replaced by "****" so it can be
# logged or displayed. URLs without userinfo or password are returned as-is.
# Arguments: $1 - URL
# Outputs:   masked URL on stdout
#######################################
mask_url() {
  local url="$1"
  local scheme rest authority tail userinfo hostport

  if [[ "$url" != *://* ]]; then
    printf '%s\n' "$url"
    return 0
  fi

  scheme="${url%%://*}"
  rest="${url#*://}"
  authority="${rest%%/*}"
  tail="${rest#"$authority"}"

  if [[ "$authority" != *@* ]]; then
    printf '%s\n' "$url"
    return 0
  fi

  userinfo="${authority%@*}"
  hostport="${authority##*@}"
  if [[ "$userinfo" == *:* ]]; then
    userinfo="${userinfo%%:*}:****"
  fi
  printf '%s://%s@%s%s\n' "$scheme" "$userinfo" "$hostport" "$tail"
}

#######################################
# Split scheme://user:pass@host:port/dbname?query into its parts.
# Uses parameter expansion rather than greedy regexes so that a missing port
# (defaults to 3306), a missing password, or an '@' inside the password are
# handled.
# Limitation: the result is '|'-delimited, so a password containing '|'
# cannot be represented; percent-encoded characters are not decoded.
# Arguments: $1 - database URL
# Outputs:   "user|pass|host|port|dbname" on stdout
#######################################
parse_db_url() {
  local url="$1"
  local rest authority path userinfo hostport
  local user pass host port dbname

  rest="${url#*://}"
  authority="${rest%%/*}"
  authority="${authority%%\?*}"

  path=""
  if [[ "$rest" == */* ]]; then
    path="${rest#*/}"
  fi
  dbname="${path%%\?*}"

  if [[ "$authority" == *@* ]]; then
    # Split on the LAST '@' so passwords containing '@' stay intact.
    userinfo="${authority%@*}"
    hostport="${authority##*@}"
  else
    userinfo=""
    hostport="$authority"
  fi

  user="${userinfo%%:*}"
  pass=""
  if [[ "$userinfo" == *:* ]]; then
    pass="${userinfo#*:}"
  fi

  host="${hostport%%:*}"
  port=""
  if [[ "$hostport" == *:* ]]; then
    port="${hostport##*:}"
  fi
  port="${port:-3306}"

  printf '%s|%s|%s|%s|%s\n' "$user" "$pass" "$host" "$port" "$dbname"
}

#######################################
# Run one SQL statement through the mysql client.
# The password option is omitted when the password is empty: a bare `-p`
# makes the client prompt on the terminal, which would hang the failover.
# A connect timeout keeps an unreachable host from blocking the script.
# Arguments: $1 host, $2 port, $3 user, $4 password, $5 SQL
# Outputs:   whatever mysql prints (callers redirect as needed)
# Returns:   mysql's exit status
#######################################
mysql_exec() {
  local host="$1" port="$2" user="$3" pass="$4" sql="$5"
  local -a args=(--connect-timeout=10 -h "$host" -P "$port" -u "$user")

  if [[ -n "$pass" ]]; then
    args+=("-p$pass")
  fi
  mysql "${args[@]}" -e "$sql"
}

#######################################
# Probe a database server.
# NOTE(review): the body between the URL parsing and the "healthy" branch was
# stripped from the reviewed source. The dry-run short-circuit mirrors
# check_app_health and the probe statement (SELECT 1) is an assumption.
# Arguments: $1 - database URL
# Returns:   0 if reachable (or dry-run), 1 otherwise
#######################################
check_db_health() {
  local url="$1"
  local parsed user pass host port dbname

  parsed=$(parse_db_url "$url")
  IFS='|' read -r user pass host port dbname <<<"$parsed"

  if [[ "$DRY_RUN" -eq 1 ]]; then
    log " [DRY-RUN] Would check: mysql -h $host -P $port"
    return 0
  fi

  if mysql_exec "$host" "$port" "$user" "$pass" "SELECT 1" >/dev/null 2>&1; then
    log " Database is healthy"
    return 0
  fi

  log " Database is NOT reachable"
  return 1
}

#######################################
# Probe the application's HTTP endpoint with curl (10 s timeout).
# The check is skipped, and treated as success, in dry-run mode or when curl
# is not installed.
# Arguments: $1 - application URL
# Returns:   0 if healthy or skipped, 1 otherwise
#######################################
check_app_health() {
  local url="$1"
  log "Checking application health: $url"

  if [[ "$DRY_RUN" -eq 1 ]]; then
    log " [DRY-RUN] Would check: curl -f $url"
    return 0
  fi

  if ! command -v curl >/dev/null 2>&1; then
    log " WARN: curl not available, skipping app health check"
    return 0
  fi

  if curl -sf -o /dev/null -m 10 "$url" 2>/dev/null; then
    log " Application is healthy"
    return 0
  fi

  log " Application is NOT healthy"
  return 1
}

#######################################
# Promote the standby to primary when it is a replica: stop and reset
# replication, then disable read-only mode. A standalone standby is left
# untouched.
# NOTE(review): the dry-run branch and the replication-status query were
# stripped from the reviewed source and are reconstructed here.
# NOTE(review): SHOW SLAVE STATUS / STOP SLAVE / RESET SLAVE are legacy
# statement names; confirm they exist on the target server version.
# Globals:   STANDBY_URL, DRY_RUN (read)
# Returns:   0 on success, 1 on failure
#######################################
promote_standby() {
  log "Promoting standby to primary..."

  local parsed user pass host port dbname slave_status
  parsed=$(parse_db_url "$STANDBY_URL")
  IFS='|' read -r user pass host port dbname <<<"$parsed"

  if [[ "$DRY_RUN" -eq 1 ]]; then
    log " [DRY-RUN] Would promote standby at $host:$port"
    return 0
  fi

  # A failed query must not be mistaken for "not a replica": with errors
  # discarded, both yield empty output, and the promotion would be skipped
  # while still being reported as successful.
  if ! slave_status=$(mysql_exec "$host" "$port" "$user" "$pass" \
    "SHOW SLAVE STATUS\G" 2>/dev/null); then
    log_error "Failed to query replication status"
    return 1
  fi

  if [[ -n "$slave_status" ]]; then
    log " Standby is a slave, promoting..."

    # Stop replication.
    if mysql_exec "$host" "$port" "$user" "$pass" \
      "STOP SLAVE; RESET SLAVE ALL;" 2>/dev/null; then
      log " Replication stopped and reset"
    else
      log_error "Failed to stop replication"
      return 1
    fi

    # Disable read-only mode.
    if mysql_exec "$host" "$port" "$user" "$pass" \
      "SET GLOBAL read_only=OFF; SET GLOBAL super_read_only=OFF;" 2>/dev/null; then
      log " Read-only mode disabled"
    else
      log_error "Failed to disable read-only mode"
      return 1
    fi
  else
    log " Standby is not a slave (standalone), skipping promotion"
  fi

  log " Standby promoted successfully"
  return 0
}

#######################################
# Point DATABASE_URL in CONFIG_FILE at the standby (backing the file up
# first) and export DATABASE_URL for the current process.
# Globals:   CONFIG_FILE, STANDBY_URL, DRY_RUN (read); DATABASE_URL (exported)
# Returns:   0 on success, 1 if the backup or the write fails
#######################################
update_config() {
  log "Updating application configuration..."

  if [[ "$DRY_RUN" -eq 1 ]]; then
    log " [DRY-RUN] Would update $CONFIG_FILE: DATABASE_URL=$(mask_url "$STANDBY_URL")"
    return 0
  fi

  if [[ -f "$CONFIG_FILE" ]]; then
    # Back up the original configuration.
    if ! cp -- "$CONFIG_FILE" "${CONFIG_FILE}.bak.$(date +%s)"; then
      log_error "Failed to back up $CONFIG_FILE"
      return 1
    fi
    log " Backed up original config to ${CONFIG_FILE}.bak.*"

    if grep -q "^DATABASE_URL=" "$CONFIG_FILE"; then
      # '&', '|' and '\' are special in the sed replacement below; a URL
      # query string such as "?ssl=true&charset=utf8" would otherwise be
      # corrupted, so escape them first.
      local escaped_url
      escaped_url=$(printf '%s' "$STANDBY_URL" | sed -e 's/[\\&|]/\\&/g')
      if ! sed -i.bak "s|^DATABASE_URL=.*|DATABASE_URL=${escaped_url}|" "$CONFIG_FILE"; then
        log_error "Failed to update DATABASE_URL in $CONFIG_FILE"
        return 1
      fi
      rm -f -- "${CONFIG_FILE}.bak" 2>/dev/null || true
      log " Updated DATABASE_URL in $CONFIG_FILE"
    else
      # Make sure the appended entry starts on its own line.
      if [[ -n "$(tail -c 1 "$CONFIG_FILE")" ]]; then
        printf '\n' >>"$CONFIG_FILE"
      fi
      if ! printf 'DATABASE_URL=%s\n' "$STANDBY_URL" >>"$CONFIG_FILE"; then
        log_error "Failed to write $CONFIG_FILE"
        return 1
      fi
      log " Added DATABASE_URL to $CONFIG_FILE"
    fi
  else
    log " WARN: Config file $CONFIG_FILE not found, creating new one"
    if ! printf 'DATABASE_URL=%s\n' "$STANDBY_URL" >"$CONFIG_FILE"; then
      log_error "Failed to write $CONFIG_FILE"
      return 1
    fi
  fi

  # Also update the environment variable for the current session.
  export DATABASE_URL="$STANDBY_URL"

  log " Configuration updated"
  return 0
}

#######################################
# Restart the application's Docker container unless --no-restart was given.
# When docker is not installed, a warning is logged and 0 is returned.
# Globals:   NO_RESTART, DRY_RUN, APP_NAME, STANDBY_URL (read)
# Returns:   0 on success or skip, 1 if `docker restart` fails
#######################################
restart_app() {
  if [[ "$NO_RESTART" -eq 1 ]]; then
    log "Skipping application restart (--no-restart)"
    return 0
  fi

  log "Restarting application..."

  if [[ "$DRY_RUN" -eq 1 ]]; then
    log " [DRY-RUN] Would restart: docker restart $APP_NAME"
    return 0
  fi

  if ! command -v docker >/dev/null 2>&1; then
    log " WARN: Docker not available, please restart application manually"
    log " Updated DATABASE_URL: $(mask_url "$STANDBY_URL")"
    return 0
  fi

  log " Restarting Docker container: $APP_NAME"
  if docker restart "$APP_NAME" 2>/dev/null; then
    log " Container restarted"
    # Give the application time to start.
    log " Waiting for application to start..."
    sleep 5
    return 0
  fi

  log_error "Failed to restart container $APP_NAME"
  return 1
}

#######################################
# Main flow: run the five failover steps and exit with the overall status.
# Database URLs are masked before being logged or displayed so credentials
# do not end up in the log file or on the terminal.
#######################################
main() {
  local primary_display standby_display
  local mode_label="manual" dry_label="no"
  local confirm="" app_healthy=0 attempt
  local -r max_attempts=5

  primary_display=$(mask_url "$PRIMARY_URL")
  standby_display=$(mask_url "$STANDBY_URL")
  if [[ "$AUTO_MODE" -eq 1 ]]; then mode_label="semi-auto"; fi
  if [[ "$DRY_RUN" -eq 1 ]]; then dry_label="yes"; fi

  log "========================================"
  log "Database Failover Started"
  log "========================================"
  log "Mode: $mode_label"
  log "Dry-run: $dry_label"
  log "Primary: $primary_display"
  log "Standby: $standby_display"
  log ""

  # Without a client every probe fails, which would read as "primary down".
  if [[ "$DRY_RUN" -eq 0 ]] && ! command -v mysql >/dev/null 2>&1; then
    log_error "mysql client not found; cannot check database health"
    exit 1
  fi

  # Step 1: check primary health.
  log "[1/5] Checking primary database health..."
  if check_db_health "$PRIMARY_URL"; then
    log " Primary is healthy"
    if [[ "$AUTO_MODE" -eq 0 ]]; then
      log " Primary is healthy. Failover not needed."
      log " Use --auto to force failover even if primary is healthy"
      log "========================================"
      log "Failover Cancelled (Primary Healthy)"
      log "========================================"
      exit 0
    fi
  else
    log " Primary is NOT healthy, proceeding with failover"
  fi

  # Semi-automatic mode: ask for confirmation.
  if [[ "$AUTO_MODE" -eq 1 && "$DRY_RUN" -eq 0 ]]; then
    echo ""
    echo "WARNING: About to failover from primary to standby."
    echo " Primary: $primary_display"
    echo " Standby: $standby_display"
    echo ""
    # EOF or a non-interactive stdin leaves the answer empty, which cancels.
    read -r -p "Type 'FAILover' to confirm: " confirm || true
    if [[ "$confirm" != "FAILover" ]]; then
      log "Failover cancelled by user"
      exit 1
    fi
  fi

  # Step 2: check standby health.
  log ""
  log "[2/5] Checking standby database health..."
  if ! check_db_health "$STANDBY_URL"; then
    log_error "Standby is also not healthy, cannot failover"
    log "========================================"
    log "Failover FAILED (Standby Unhealthy)"
    log "========================================"
    exit 1
  fi

  # Step 3: promote the standby.
  log ""
  log "[3/5] Promoting standby to primary..."
  if ! promote_standby; then
    log_error "Failed to promote standby"
    exit 1
  fi

  # Step 4: update configuration and restart the application.
  log ""
  log "[4/5] Updating application configuration and restarting..."
  if ! update_config; then
    log_error "Failed to update application configuration"
    log " Manual intervention required"
    exit 1
  fi

  if ! restart_app; then
    log_error "Failed to restart application"
    log " Manual intervention required"
    exit 1
  fi

  # Step 5: verify the failover.
  log ""
  log "[5/5] Verifying failover..."
  if [[ "$DRY_RUN" -eq 0 ]]; then
    sleep 3
  fi

  for ((attempt = 1; attempt <= max_attempts; attempt++)); do
    if check_app_health "$APP_URL"; then
      app_healthy=1
      break
    fi
    # No point in waiting after the last attempt.
    if ((attempt < max_attempts)); then
      log " Retry $attempt/5 in 5 seconds..."
      sleep 5
    fi
  done

  if [[ "$app_healthy" -eq 0 ]]; then
    log_error "Application is not healthy after failover"
    log " Check application logs and configuration"
    log "========================================"
    log "Failover FAILED (App Unhealthy)"
    log "========================================"
    exit 1
  fi

  # Check the database connection through the application. Skipped when curl
  # is missing, consistent with check_app_health.
  log " Verifying database connection via application..."
  if [[ "$DRY_RUN" -eq 0 ]] && command -v curl >/dev/null 2>&1; then
    if curl -sf -m 10 "$APP_URL" >/dev/null 2>&1; then
      log " Application responding successfully"
    else
      log_error "Application not responding"
      exit 1
    fi
  fi

  log ""
  log "========================================"
  log "Failover Completed Successfully"
  log "========================================"
  log "Primary (old): $primary_display"
  log "Standby (new): $standby_display"
  log "Application: $APP_URL"
  log "Log file: $LOG_FILE"
  log ""
  log "Post-failover checklist:"
  log " 1. Verify application functionality"
  log " 2. Update monitoring alerts"
  log " 3. Notify stakeholders"
  log " 4. Plan primary database recovery"
  log " 5. Schedule post-mortem review"
  log ""

  exit 0
}

main