#!/bin/bash # 灾备演练脚本 # 用法: ./dr-drill.sh # 自动化灾备演练:从备份恢复到测试数据库,验证数据完整性 set -u show_help() { cat <&2 exit 1 fi BACKUP_FILE="$2" shift 2 ;; --test-db) if [ $# -lt 2 ]; then echo "ERROR: --test-db requires an argument" >&2 exit 1 fi DR_DRILL_TEST_DB="$2" shift 2 ;; --no-cleanup) NO_CLEANUP=1 shift ;; --report-dir) if [ $# -lt 2 ]; then echo "ERROR: --report-dir requires an argument" >&2 exit 1 fi REPORT_DIR="$2" shift 2 ;; *) echo "ERROR: Unknown argument: $1" >&2 exit 1 ;; esac done # 配置 DATABASE_URL="${DATABASE_URL:-}" BACKUP_DIR="${BACKUP_DIR:-./backups}" TEST_DB="${DR_DRILL_TEST_DB:-next_edu_dr_drill}" REPORT_DIR="${REPORT_DIR:-${DR_DRILL_REPORT_DIR:-docs/dr/reports}}" TIMESTAMP=$(date +"%Y%m%d_%H%M%S") REPORT_FILE="$REPORT_DIR/dr_drill_${TIMESTAMP}.md" # 检查 DATABASE_URL if [ -z "$DATABASE_URL" ]; then echo "ERROR: DATABASE_URL not set" >&2 exit 1 fi # 解析 DATABASE_URL DB_USER=$(echo "$DATABASE_URL" | sed -n 's/.*:\/\/\([^:]*\):.*/\1/p') DB_PASS=$(echo "$DATABASE_URL" | sed -n 's/.*:\/\/[^:]*:\([^@]*\)@.*/\1/p') DB_HOST=$(echo "$DATABASE_URL" | sed -n 's/.*@\([^:]*\):.*/\1/p') DB_PORT=$(echo "$DATABASE_URL" | sed -n 's/.*:\([0-9]*\)\/.*/\1/p') DB_NAME=$(echo "$DATABASE_URL" | sed -n 's/.*\/\([^?]*\).*/\1/p') # 创建报告目录 mkdir -p "$REPORT_DIR" # 初始化报告 init_report() { cat > "$REPORT_FILE" <> "$REPORT_FILE" } # 记录步骤结果 step_result() { local step="$1" local status="$2" local detail="$3" append_report "### 步骤 $step: $status" append_report "" append_report "$detail" append_report "" if [ "$status" = "FAILED" ]; then append_report "❌ 步骤失败" else append_report "✅ 步骤成功" fi append_report "" echo "---" } echo "=== Disaster Recovery Drill ===" echo "Time: $(date -u +"%Y-%m-%dT%H:%M:%SZ")" echo "Test DB: $TEST_DB" echo "Source DB: $DB_NAME@$DB_HOST:$DB_PORT" echo "Report: $REPORT_FILE" echo "" init_report DRILL_START=$(date +%s) OVERALL_STATUS="SUCCESS" # 步骤 1: 查找备份文件 echo "[1/6] Locating backup file..." 
# ---------------------------------------------------------------------------
# Step 1: locate the backup file.
# Reads:  BACKUP_FILE (may be empty), BACKUP_DIR
# Writes: BACKUP_FILE, BACKUP_SIZE, OVERALL_STATUS
# Exits 1 (after recording the failure in the report) when no usable backup
# file can be found.
# ---------------------------------------------------------------------------
if [ -z "$BACKUP_FILE" ]; then
  # No file given: pick the most recently modified db_backup_*.sql.gz.
  # A glob plus -nt is used instead of parsing `ls -t` output, which breaks on
  # unusual file names.
  for backup_candidate in "$BACKUP_DIR"/db_backup_*.sql.gz; do
    # An unmatched glob stays literal; skip anything that is not a real file.
    [ -f "$backup_candidate" ] || continue
    if [ -z "$BACKUP_FILE" ] || [ "$backup_candidate" -nt "$BACKUP_FILE" ]; then
      BACKUP_FILE="$backup_candidate"
    fi
  done

  if [ -z "$BACKUP_FILE" ]; then
    echo " FAIL: No backup file found in $BACKUP_DIR"
    step_result "1 - 定位备份文件" "FAILED" "未找到备份文件于 $BACKUP_DIR"
    OVERALL_STATUS="FAILED"
    append_report "## 演练结果: ❌ FAILED"
    append_report ""
    append_report "演练失败,未找到备份文件"
    exit 1
  fi
fi

if [ ! -f "$BACKUP_FILE" ]; then
  echo " FAIL: Backup file not found: $BACKUP_FILE"
  step_result "1 - 定位备份文件" "FAILED" "备份文件不存在: $BACKUP_FILE"
  OVERALL_STATUS="FAILED"
  append_report "## 演练结果: ❌ FAILED"
  exit 1
fi

# GNU stat first (-c%s), BSD/macOS stat (-f%z) as fallback.
BACKUP_SIZE=$(stat -c%s "$BACKUP_FILE" 2>/dev/null || stat -f%z "$BACKUP_FILE" 2>/dev/null)
echo " PASS: Found backup: $BACKUP_FILE (${BACKUP_SIZE} bytes)"
step_result "1 - 定位备份文件" "PASSED" "备份文件: \`$BACKUP_FILE\` (${BACKUP_SIZE} bytes)"

# ---------------------------------------------------------------------------
# Step 2: create the test database.
# Reads:  TEST_DB, DB_NAME, DB_HOST, DB_PORT, DB_USER, DB_PASS
# Writes: OVERALL_STATUS
# Exits 1 when the test database cannot be (safely) created.
# ---------------------------------------------------------------------------
echo "[2/6] Creating test database..."

# Safety guard: the test database is dropped unconditionally below (and again
# during cleanup). If it is the same schema as the source database, the drill
# would destroy the very data it is meant to protect, so refuse to continue.
if [ "$TEST_DB" = "$DB_NAME" ]; then
  echo " FAIL: Test database must differ from source database: $TEST_DB"
  step_result "2 - 创建测试数据库" "FAILED" "测试数据库 \`$TEST_DB\` 与源数据库同名,已拒绝执行"
  OVERALL_STATUS="FAILED"
  append_report "## 演练结果: ❌ FAILED"
  exit 1
fi

# Drop a leftover test database from a previous run. Best effort: a failure
# here surfaces through the CREATE DATABASE check right after.
mysql -h "$DB_HOST" -P "$DB_PORT" -u "$DB_USER" -p"$DB_PASS" \
  -e "DROP DATABASE IF EXISTS \`$TEST_DB\`;" 2>/dev/null

if mysql -h "$DB_HOST" -P "$DB_PORT" -u "$DB_USER" -p"$DB_PASS" \
  -e "CREATE DATABASE \`$TEST_DB\` CHARACTER SET utf8mb4 COLLATE utf8mb4_unicode_ci;" 2>/dev/null; then
  echo " PASS: Test database created: $TEST_DB"
  step_result "2 - 创建测试数据库" "PASSED" "测试数据库 \`$TEST_DB\` 创建成功"
else
  echo " FAIL: Could not create test database"
  step_result "2 - 创建测试数据库" "FAILED" "创建测试数据库 \`$TEST_DB\` 失败"
  OVERALL_STATUS="FAILED"
  append_report "## 演练结果: ❌ FAILED"
  exit 1
fi

# Step 3: restore the backup into the test database.
echo "[3/6] Restoring backup to test database..."
# ---------------------------------------------------------------------------
# Step 3 (body): restore the backup into the test database.
# Reads:  BACKUP_FILE, TEST_DB, NO_CLEANUP, DB_* connection settings
# Writes: RESTORE_START, RESTORE_END, RESTORE_DURATION, OVERALL_STATUS
# Exits 1 (after an optional cleanup) when the restore fails.
# ---------------------------------------------------------------------------
RESTORE_START=$(date +%s)

gunzip -c "$BACKUP_FILE" \
  | mysql -h "$DB_HOST" -P "$DB_PORT" -u "$DB_USER" -p"$DB_PASS" "$TEST_DB" 2>/dev/null
# Copy PIPESTATUS immediately; the next command overwrites it. Both stages are
# checked: without this, a corrupt/truncated archive (gunzip failure) would be
# reported as a successful restore as long as mysql accepted the partial input.
restore_pipe_status=("${PIPESTATUS[@]}")

if [ "${restore_pipe_status[0]}" -eq 0 ] && [ "${restore_pipe_status[1]}" -eq 0 ]; then
  RESTORE_END=$(date +%s)
  RESTORE_DURATION=$((RESTORE_END - RESTORE_START))
  echo " PASS: Restore completed in ${RESTORE_DURATION}s"
  step_result "3 - 从备份恢复" "PASSED" "恢复完成,耗时 ${RESTORE_DURATION} 秒"
else
  echo " FAIL: Restore failed"
  step_result "3 - 从备份恢复" "FAILED" "从备份恢复失败"
  OVERALL_STATUS="FAILED"
  # Best-effort cleanup of the half-restored test database.
  if [ "$NO_CLEANUP" -eq 0 ]; then
    mysql -h "$DB_HOST" -P "$DB_PORT" -u "$DB_USER" -p"$DB_PASS" \
      -e "DROP DATABASE IF EXISTS \`$TEST_DB\`;" 2>/dev/null || true
  fi
  append_report "## 演练结果: ❌ FAILED"
  exit 1
fi

# ---------------------------------------------------------------------------
# Step 4: data integrity check (table count and approximate row count of the
# test database compared with the source database).
# Writes: TEST_TABLES, SOURCE_TABLES, TEST_RECORDS, SOURCE_RECORDS,
#         INTEGRITY_DETAIL
# ---------------------------------------------------------------------------
echo "[4/6] Running data integrity checks..."

# Number of tables in the test database.
TEST_TABLES=$(mysql -h "$DB_HOST" -P "$DB_PORT" -u "$DB_USER" -p"$DB_PASS" \
  -e "SELECT COUNT(*) FROM information_schema.tables WHERE table_schema='$TEST_DB';" \
  -s -N 2>/dev/null || echo 0)

# Number of tables in the source database.
SOURCE_TABLES=$(mysql -h "$DB_HOST" -P "$DB_PORT" -u "$DB_USER" -p"$DB_PASS" \
  -e "SELECT COUNT(*) FROM information_schema.tables WHERE table_schema='$DB_NAME';" \
  -s -N 2>/dev/null || echo 0)

# The counts feed an integer comparison below; anything empty or non-numeric
# would make `[ ... -ge ... ]` error out, so normalise such values to 0.
case "$TEST_TABLES" in '' | *[!0-9]*) TEST_TABLES=0 ;; esac
case "$SOURCE_TABLES" in '' | *[!0-9]*) SOURCE_TABLES=0 ;; esac

echo " Test DB tables: $TEST_TABLES"
echo " Source DB tables: $SOURCE_TABLES"

# Row totals as reported by information_schema (display only).
TEST_RECORDS=$(mysql -h "$DB_HOST" -P "$DB_PORT" -u "$DB_USER" -p"$DB_PASS" \
  -e "SELECT SUM(table_rows) FROM information_schema.tables WHERE table_schema='$TEST_DB';" \
  -s -N 2>/dev/null || echo 0)

SOURCE_RECORDS=$(mysql -h "$DB_HOST" -P "$DB_PORT" -u "$DB_USER" -p"$DB_PASS" \
  -e "SELECT SUM(table_rows) FROM information_schema.tables WHERE table_schema='$DB_NAME';" \
  -s -N 2>/dev/null || echo 0)

echo " Test DB records: $TEST_RECORDS"
echo " Source DB records: $SOURCE_RECORDS"

# Markdown table for the report; the line breaks are part of the value.
INTEGRITY_DETAIL="| 指标 | 测试库 | 源库 |
|------|--------|------|
| 表数量 | $TEST_TABLES | $SOURCE_TABLES |
| 记录数(近似) | $TEST_RECORDS | $SOURCE_RECORDS |"

if [ "$TEST_TABLES" -ge "$SOURCE_TABLES" ]; then
  echo " PASS: Table count matches"
  step_result "4 - 数据完整性检查" "PASSED" "$INTEGRITY_DETAIL"
else
  echo " WARN: Test DB has fewer tables than source"
  step_result "4 - 数据完整性检查" "WARN" "$INTEGRITY_DETAIL

⚠️ 测试库表数量少于源库"
fi

# ---------------------------------------------------------------------------
# Step 5: smoke tests against the restored database.
# Writes: SMOKE_PASSED, SMOKE_FAILED, SMOKE_DETAIL, JOIN_TEST, OVERALL_STATUS
# ---------------------------------------------------------------------------
echo "[5/6] Running smoke tests..."

SMOKE_PASSED=0
SMOKE_FAILED=0
SMOKE_DETAIL=""

# Tests 1 and 2: row count of the users and schools tables. A missing table
# is only a warning, because the table may legitimately not exist.
for smoke_table in users schools; do
  smoke_count=$(mysql -h "$DB_HOST" -P "$DB_PORT" -u "$DB_USER" -p"$DB_PASS" "$TEST_DB" \
    -e "SELECT COUNT(*) FROM ${smoke_table};" -s -N 2>/dev/null || echo "N/A")
  if [ "$smoke_count" != "N/A" ]; then
    SMOKE_PASSED=$((SMOKE_PASSED + 1))
    SMOKE_DETAIL="${SMOKE_DETAIL}- ✅ ${smoke_table} 表查询成功: ${smoke_count} 条记录"$'\n'
    echo " PASS: ${smoke_table} table query: $smoke_count records"
  else
    SMOKE_DETAIL="${SMOKE_DETAIL}- ⚠️ ${smoke_table} 表不存在或查询失败"$'\n'
    echo " WARN: ${smoke_table} table not found or query failed"
  fi
done

# Test 3: the restored schema must contain at least one base table.
JOIN_TEST=$(mysql -h "$DB_HOST" -P "$DB_PORT" -u "$DB_USER" -p"$DB_PASS" "$TEST_DB" \
  -e "SELECT COUNT(*) FROM information_schema.tables WHERE table_schema='$TEST_DB' AND table_type='BASE TABLE';" \
  -s -N 2>/dev/null || echo "0")
# Guard the integer test against empty/non-numeric output.
case "$JOIN_TEST" in '' | *[!0-9]*) JOIN_TEST=0 ;; esac

if [ "$JOIN_TEST" -gt 0 ]; then
  SMOKE_PASSED=$((SMOKE_PASSED + 1))
  SMOKE_DETAIL="${SMOKE_DETAIL}- ✅ 基础表查询成功: ${JOIN_TEST} 个基础表"$'\n'
  echo " PASS: Base table query: $JOIN_TEST tables"
else
  SMOKE_DETAIL="${SMOKE_DETAIL}- ❌ 基础表查询失败"$'\n'
  SMOKE_FAILED=$((SMOKE_FAILED + 1))
  echo " FAIL: Base table query failed"
fi

# A failed smoke test fails the step and the whole drill. The drill still
# continues so the test database gets cleaned up and the summary is written.
if [ "$SMOKE_FAILED" -gt 0 ]; then
  smoke_status="FAILED"
  OVERALL_STATUS="FAILED"
else
  smoke_status="PASSED"
fi

step_result "5 - 冒烟测试" "$smoke_status" "通过: $SMOKE_PASSED, 失败: $SMOKE_FAILED

$SMOKE_DETAIL"

# Step 6: clean up the test database.
echo "[6/6] Cleaning up test database..."
# ---------------------------------------------------------------------------
# Step 6 (body): drop the test database unless --no-cleanup was requested.
# Reads: NO_CLEANUP, TEST_DB, DB_* connection settings
# A failed drop is only a warning; the operator has to clean up manually.
# ---------------------------------------------------------------------------
if [ "$NO_CLEANUP" -eq 1 ]; then
  echo " SKIP: Cleanup skipped (--no-cleanup)"
  step_result "6 - 清理测试数据库" "SKIPPED" "演练后保留测试数据库 \`$TEST_DB\`"
elif mysql -h "$DB_HOST" -P "$DB_PORT" -u "$DB_USER" -p"$DB_PASS" \
  -e "DROP DATABASE IF EXISTS \`$TEST_DB\`;" 2>/dev/null; then
  echo " PASS: Test database dropped: $TEST_DB"
  step_result "6 - 清理测试数据库" "PASSED" "测试数据库 \`$TEST_DB\` 已删除"
else
  echo " WARN: Could not drop test database (manual cleanup required)"
  step_result "6 - 清理测试数据库" "WARN" "⚠️ 无法删除测试数据库 \`$TEST_DB\`,需手动清理"
fi

# ---------------------------------------------------------------------------
# Summary: overall result, timings and RTO/RPO assessment appended to the
# report, a short recap on stdout, and the exit status of the drill
# (0 when OVERALL_STATUS is SUCCESS, 1 otherwise).
# ---------------------------------------------------------------------------
DRILL_END=$(date +%s)
DRILL_DURATION=$((DRILL_END - DRILL_START))

append_report "## 演练结果"
append_report ""
if [ "$OVERALL_STATUS" = "SUCCESS" ]; then
  append_report "**状态**: ✅ 成功"
else
  append_report "**状态**: ❌ 失败"
fi
append_report "**总耗时**: ${DRILL_DURATION} 秒"
append_report "**备份文件**: \`$BACKUP_FILE\`"
append_report "**测试数据库**: \`$TEST_DB\`"
append_report ""

# RTO threshold in seconds; matches the 4-hour target printed in the report.
RTO_TARGET_SECONDS=14400

append_report "## RTO/RPO 评估"
append_report ""
append_report "- **RTO 目标**: 4 小时"
# Every use of RESTORE_DURATION sits behind the "is it set" guard: under
# `set -u` an unguarded expansion of an unset variable aborts the script,
# which would make the guard on the assessment line pointless.
if [ -n "${RESTORE_DURATION:-}" ]; then
  append_report "- **本次恢复耗时**: ${RESTORE_DURATION} 秒 ($((RESTORE_DURATION / 60)) 分钟)"
else
  append_report "- **本次恢复耗时**: N/A"
fi
if [ -n "${RESTORE_DURATION:-}" ] && [ "$RESTORE_DURATION" -lt "$RTO_TARGET_SECONDS" ]; then
  append_report "- **RTO 评估**: ✅ 达标"
else
  append_report "- **RTO 评估**: ⚠️ 需关注"
fi
append_report "- **RPO 目标**: 24 小时(取决于备份频率)"
append_report ""

echo ""
echo "=== Drill Summary ==="
echo "Status: $OVERALL_STATUS"
echo "Duration: ${DRILL_DURATION}s"
echo "Report: $REPORT_FILE"
echo ""

if [ "$OVERALL_STATUS" = "SUCCESS" ]; then
  exit 0
fi
exit 1