feat(P2): 实现质量保障类5项功能(无障碍/视觉回归/通知渠道/漏洞扫描/灾备)

## 新增功能

### 1. 屏幕阅读器兼容性增强(a11y)
- 无障碍工具库:src/shared/lib/a11y.ts
- aria-live Hook:src/shared/hooks/use-aria-live.ts
- a11y 组件:skip-link/visually-hidden/focus-trap/aria-status
- 增强 UI:table.tsx 系统性 ARIA role,dialog.tsx aria-modal
- 审计文档:docs/accessibility/a11y-audit.md(WCAG 2.1 AA 清单)

### 2. 视觉回归测试
- 测试套件:tests/visual/(homepage + 3 个 dashboard)
- 3 视口(desktop/tablet/mobile)× 2 主题(light/dark)
- 动态元素遮罩,避免误报
- playwright.config.ts 新增 visual-chromium 项目
- 文档:docs/testing/visual-regression.md

### 3. 短信/微信推送渠道集成
- 新模块:src/modules/notifications/
- 4 个渠道:SMS(阿里云/腾讯云)、WeChat(公众号)、Email(SMTP)、In-App
- 分发器按用户偏好并行多渠道发送
- 外部 SDK 动态 import,Mock 模式开发可用
- 文档:docs/notifications/channels.md

### 4. 漏洞扫描 CI 集成
- CI security-scan job:npm audit + Snyk + Trivy FS + OWASP ZAP
- 独立工作流 security.yml:每周一深度扫描 + 容器镜像扫描
- 配置:suppressions.json + .trivyignore
- 本地脚本:security-scan.sh/ps1
- 文档:docs/security/scanning.md(SLA 分级)

### 5. 灾备方案
- 脚本:backup-verify/backup-offsite-sync/dr-drill/failover/health-check
- CI 增强:备份后校验+异地同步,每周灾备演练
- 独立工作流 dr-drill.yml:每周一凌晨 4 点自动演练
- 文档:docs/dr/dr-plan.md(RTO 4h/RPO 24h)+ dr-runbook.md(6 故障场景)

## 验证
- npx tsc --noEmit:0 错误
- npm run lint:0 错误 0 警告
This commit is contained in:
SpecialX
2026-06-17 20:18:29 +08:00
parent b86255f0ea
commit 6585e10c6f
53 changed files with 7491 additions and 37 deletions

View File

@@ -0,0 +1,342 @@
#!/bin/bash
# Offsite backup sync script
# Usage: ./backup-offsite-sync.sh
# Syncs local backups to remote storage (S3/OSS/NFS), with optional verification and cleanup
# Treat references to unset variables as errors.
set -u
# Print the usage text (options, environment variables, required tools and
# exit codes) to stdout. $0 is expanded because the here-document delimiter
# is unquoted.
show_help() {
cat <<EOF
用法: $0 [选项]
异地备份同步脚本,将本地备份同步到远程存储
选项:
--backend TYPE 远程存储后端类型: s3|oss|nfs|none
--no-cleanup 不清理远程过期备份
--no-verify 不校验同步结果
--help, -h 显示帮助信息
环境变量:
BACKUP_DIR 本地备份目录(默认 ./backups)
BACKUP_OFFSITE_BACKEND 远程后端类型: s3|oss|nfs|none (默认 none)
BACKUP_OFFSITE_REMOTE 远程目标路径
- s3: s3://bucket-name/path
- oss: oss://bucket-name/path
- nfs: /mnt/nfs/backup-path
BACKUP_OFFSITE_BUCKET 存储桶名称(仅 s3/oss)
BACKUP_OFFSITE_ACCESS_KEY 访问密钥
BACKUP_OFFSITE_SECRET_KEY 秘密密钥
BACKUP_OFFSITE_REGION 区域(默认 us-east-1)
BACKUP_OFFSITE_RETENTION_DAYS 远程保留天数(默认 90)
需要工具:
s3: aws-cli (aws) 或 rclone
oss: ossutil 或 rclone
nfs: rsync (NFS 应已挂载到 BACKUP_OFFSITE_REMOTE)
退出码:
0 同步成功
1 同步失败
EOF
}
# Parse command-line options. --backend stores its value in
# BACKUP_OFFSITE_BACKEND; the two --no-* switches set NO_CLEANUP / NO_VERIFY.
# Any unrecognised word is an error.
NO_VERIFY=0
NO_CLEANUP=0
until [ $# -eq 0 ]; do
  if [ "$1" = "--help" ] || [ "$1" = "-h" ]; then
    show_help
    exit 0
  elif [ "$1" = "--backend" ]; then
    # The backend name is the next word; refuse to continue without it.
    [ $# -ge 2 ] || {
      echo "ERROR: --backend requires an argument" >&2
      exit 1
    }
    BACKUP_OFFSITE_BACKEND="$2"
    shift 2
  elif [ "$1" = "--no-cleanup" ]; then
    NO_CLEANUP=1
    shift
  elif [ "$1" = "--no-verify" ]; then
    NO_VERIFY=1
    shift
  else
    echo "ERROR: Unknown argument: $1" >&2
    exit 1
  fi
done
# Resolve settings from the environment, falling back to defaults. A --backend
# option given on the command line has already been stored in
# BACKUP_OFFSITE_BACKEND by the argument loop.
BACKUP_DIR="${BACKUP_DIR:-./backups}"
BACKEND="${BACKUP_OFFSITE_BACKEND:-none}"
REMOTE="${BACKUP_OFFSITE_REMOTE:-}"
BUCKET="${BACKUP_OFFSITE_BUCKET:-}"
ACCESS_KEY="${BACKUP_OFFSITE_ACCESS_KEY:-}"
SECRET_KEY="${BACKUP_OFFSITE_SECRET_KEY:-}"
REGION="${BACKUP_OFFSITE_REGION:-us-east-1}"
RETENTION_DAYS="${BACKUP_OFFSITE_RETENTION_DAYS:-90}"
echo "=== Offsite Backup Sync ==="
echo "Time: $(date -u +"%Y-%m-%dT%H:%M:%SZ")"
echo "Backend: $BACKEND"
echo "Local: $BACKUP_DIR"
echo "Remote: $REMOTE"
echo ""
# Backend "none" means the feature is switched off: succeed without doing
# anything.
if [ "$BACKEND" = "none" ]; then
  echo "INFO: BACKUP_OFFSITE_BACKEND=none, offsite sync disabled"
  echo "To enable, set BACKUP_OFFSITE_BACKEND to s3, oss, or nfs"
  exit 0
fi
case "$BACKEND" in
  s3 | oss | nfs) ;;
  *)
    echo "ERROR: Invalid backend: $BACKEND (must be s3, oss, nfs, or none)" >&2
    exit 1
    ;;
esac
# The retention period is used in integer comparisons and as an argument to
# date/find; reject anything that is not a plain non-negative integer instead
# of letting those commands fail silently later.
case "$RETENTION_DAYS" in
  '' | *[!0-9]*)
    echo "ERROR: BACKUP_OFFSITE_RETENTION_DAYS must be a non-negative integer: $RETENTION_DAYS" >&2
    exit 1
    ;;
esac
# The local backup directory must exist.
if [ ! -d "$BACKUP_DIR" ]; then
  echo "ERROR: Local backup directory does not exist: $BACKUP_DIR" >&2
  exit 1
fi
# Count local backups with a glob rather than by parsing `ls` output.
LOCAL_FILES=0
for local_backup_path in "$BACKUP_DIR"/db_backup_*.sql.gz; do
  # An unmatched glob is left as the literal pattern; skip it.
  [ -e "$local_backup_path" ] || continue
  LOCAL_FILES=$((LOCAL_FILES + 1))
done
unset local_backup_path
if [ "$LOCAL_FILES" -eq 0 ]; then
  echo "ERROR: No backup files found in $BACKUP_DIR" >&2
  exit 1
fi
echo "INFO: Found $LOCAL_FILES local backup files"
# A remote target is mandatory for every real backend.
if [ -z "$REMOTE" ]; then
  echo "ERROR: BACKUP_OFFSITE_REMOTE not set" >&2
  echo "Example for $BACKEND:" >&2
  case "$BACKEND" in
    s3) echo " s3://my-bucket/backups/" >&2 ;;
    oss) echo " oss://my-bucket/backups/" >&2 ;;
    nfs) echo " /mnt/nfs/backups/" >&2 ;;
  esac
  exit 1
fi
# Exit the script with status 1 unless the command named by $1 can be resolved
# by the shell. Reads the global BACKEND for the install hint.
check_tool() {
  command -v "$1" >/dev/null 2>&1 && return
  echo "ERROR: Required tool not found: $1" >&2
  echo "Please install $1 to use the $BACKEND backend" >&2
  exit 1
}
# Prepare credentials for the selected backend.
# Globals read: BACKEND, ACCESS_KEY, SECRET_KEY, REGION, REMOTE.
#   s3  - export AWS_ACCESS_KEY_ID / AWS_SECRET_ACCESS_KEY / AWS_DEFAULT_REGION
#         when both keys are provided
#   oss - write ~/.ossutilconfig when both keys are provided (an existing file
#         is first copied to ~/.ossutilconfig.bak)
#   nfs - no credentials; exits 1 if REMOTE is not an existing directory
setup_credentials() {
  case "$BACKEND" in
    s3)
      if [ -n "$ACCESS_KEY" ] && [ -n "$SECRET_KEY" ]; then
        export AWS_ACCESS_KEY_ID="$ACCESS_KEY"
        export AWS_SECRET_ACCESS_KEY="$SECRET_KEY"
        export AWS_DEFAULT_REGION="$REGION"
      fi
      ;;
    oss)
      if [ -n "$ACCESS_KEY" ] && [ -n "$SECRET_KEY" ]; then
        if [ -f ~/.ossutilconfig ]; then
          cp ~/.ossutilconfig ~/.ossutilconfig.bak 2>/dev/null || true
          # The backup copy may hold credentials too.
          chmod 600 ~/.ossutilconfig.bak 2>/dev/null || true
        fi
        # The config contains the secret key: create it under a restrictive
        # umask (subshell keeps the umask change local) ...
        (
          umask 077
          cat > ~/.ossutilconfig <<EOF
[Credentials]
provider = oss
accessKey = $ACCESS_KEY
secretKey = $SECRET_KEY
[Default]
endpoint = oss-${REGION}.aliyuncs.com
EOF
        ) || {
          echo "ERROR: Could not write ~/.ossutilconfig" >&2
          exit 1
        }
        # ... and force 0600 in case the file already existed with wider
        # permissions (umask only affects newly created files).
        chmod 600 ~/.ossutilconfig 2>/dev/null || true
      fi
      ;;
    nfs)
      # NFS is expected to be mounted already; no credentials are needed.
      if [ ! -d "$REMOTE" ]; then
        echo "ERROR: NFS remote directory does not exist: $REMOTE" >&2
        echo "Please ensure NFS is mounted at this path" >&2
        exit 1
      fi
      ;;
  esac
}
# Upload every db_backup_*.sql.gz from BACKUP_DIR to REMOTE with the first
# available tool for the selected BACKEND:
#   s3  - aws-cli, else rclone
#   oss - ossutil, else rclone
#   nfs - rsync, else plain cp
# Globals read: BACKEND, BACKUP_DIR, REMOTE.
# Returns 0 on success, 1 when the transfer fails or no usable tool is found.
sync_to_remote() {
  echo ""
  echo "[1/3] Syncing backups to $BACKEND..."
  case "$BACKEND" in
    s3)
      if command -v aws >/dev/null 2>&1; then
        echo "Using aws-cli"
        if ! aws s3 sync "$BACKUP_DIR/" "$REMOTE" \
          --exclude "*" --include "db_backup_*.sql.gz" \
          --no-progress; then
          echo "ERROR: aws s3 sync failed" >&2
          return 1
        fi
      elif command -v rclone >/dev/null 2>&1; then
        echo "Using rclone"
        # `rclone copy`, not `rclone sync`: sync makes the destination match
        # the source and would delete remote backups that no longer exist
        # locally, defeating the separate remote retention period. The aws
        # branch above never deletes either.
        if ! rclone copy "$BACKUP_DIR" "$REMOTE" \
          --include "db_backup_*.sql.gz" \
          --progress; then
          echo "ERROR: rclone sync failed" >&2
          return 1
        fi
      else
        echo "ERROR: Neither aws-cli nor rclone found" >&2
        return 1
      fi
      ;;
    oss)
      if command -v ossutil >/dev/null 2>&1; then
        echo "Using ossutil"
        if ! ossutil cp -r "$BACKUP_DIR/" "$REMOTE" \
          --include "db_backup_*.sql.gz" -f; then
          echo "ERROR: ossutil sync failed" >&2
          return 1
        fi
      elif command -v rclone >/dev/null 2>&1; then
        echo "Using rclone"
        # See the s3 branch: copy is additive, sync would prune the remote.
        if ! rclone copy "$BACKUP_DIR" "$REMOTE" \
          --include "db_backup_*.sql.gz" \
          --progress; then
          echo "ERROR: rclone sync failed" >&2
          return 1
        fi
      else
        echo "ERROR: Neither ossutil nor rclone found" >&2
        return 1
      fi
      ;;
    nfs)
      if command -v rsync >/dev/null 2>&1; then
        echo "Using rsync"
        mkdir -p "$REMOTE" 2>/dev/null || true
        if ! rsync -av --include="db_backup_*.sql.gz" --exclude="*" \
          "$BACKUP_DIR/" "$REMOTE/"; then
          echo "ERROR: rsync failed" >&2
          return 1
        fi
      else
        echo "Using cp (rsync not available)"
        mkdir -p "$REMOTE" 2>/dev/null || true
        if ! cp "$BACKUP_DIR"/db_backup_*.sql.gz "$REMOTE/" 2>/dev/null; then
          echo "ERROR: cp failed" >&2
          return 1
        fi
      fi
      ;;
  esac
  echo " PASS: Sync completed"
  return 0
}
# Compare the number of backups on the remote with LOCAL_FILES and print the
# result. A lower remote count is reported as a warning only.
# Globals read: NO_VERIFY, BACKEND, REMOTE, LOCAL_FILES.
# Globals written: REMOTE_FILES.
# Always returns 0.
verify_sync() {
  if [ "$NO_VERIFY" -eq 1 ]; then
    echo ""
    echo "[2/3] Verification skipped (--no-verify)"
    return 0
  fi
  echo ""
  echo "[2/3] Verifying sync result..."
  REMOTE_FILES=0
  # Note on `grep -c`: it always prints a count (0 when nothing matches) but
  # exits 1 in that case. `|| true` only neutralises the status; appending
  # `|| echo 0` would produce the two-line value "0\n0" and break the numeric
  # comparison below.
  case "$BACKEND" in
    s3)
      if command -v aws >/dev/null 2>&1; then
        REMOTE_FILES=$(aws s3 ls "$REMOTE" --recursive 2>/dev/null | grep -c "db_backup_.*\.sql\.gz" || true)
      elif command -v rclone >/dev/null 2>&1; then
        REMOTE_FILES=$(rclone lsf "$REMOTE" --include "db_backup_*.sql.gz" 2>/dev/null | wc -l)
      fi
      ;;
    oss)
      if command -v ossutil >/dev/null 2>&1; then
        REMOTE_FILES=$(ossutil ls "$REMOTE" 2>/dev/null | grep -c "db_backup_.*\.sql\.gz" || true)
      elif command -v rclone >/dev/null 2>&1; then
        REMOTE_FILES=$(rclone lsf "$REMOTE" --include "db_backup_*.sql.gz" 2>/dev/null | wc -l)
      fi
      ;;
    nfs)
      REMOTE_FILES=$(ls -1 "$REMOTE"/db_backup_*.sql.gz 2>/dev/null | wc -l)
      ;;
  esac
  # Some wc implementations pad the number with blanks; strip them and fall
  # back to 0 when a listing produced nothing at all.
  REMOTE_FILES=$(printf '%s' "$REMOTE_FILES" | tr -d '[:space:]')
  REMOTE_FILES=${REMOTE_FILES:-0}
  echo " Local files: $LOCAL_FILES"
  echo " Remote files: $REMOTE_FILES"
  if [ "$REMOTE_FILES" -lt "$LOCAL_FILES" ]; then
    echo " WARN: Remote has fewer files than local (some may have been cleaned up)"
  else
    echo " PASS: File count verified"
  fi
  return 0
}
# Delete remote backups older than RETENTION_DAYS.
# Globals read: NO_CLEANUP, BACKEND, REMOTE, RETENTION_DAYS.
#   s3  - the age is taken from the YYYYMMDD_HHMMSS stamp in the file name
#   oss - nothing is deleted; only a hint about lifecycle rules is printed
#   nfs - find -mtime on REMOTE
# Always returns 0; individual delete failures are reported as warnings.
cleanup_remote() {
  if [ "$NO_CLEANUP" -eq 1 ]; then
    echo ""
    echo "[3/3] Cleanup skipped (--no-cleanup)"
    return 0
  fi
  echo ""
  echo "[3/3] Cleaning up remote backups older than $RETENTION_DAYS days..."
  case "$BACKEND" in
    s3)
      if command -v aws >/dev/null 2>&1; then
        local bucket cutoff
        # `aws s3 ls --recursive` prints keys relative to the bucket (prefix
        # included), so deletions must be addressed as s3://<bucket>/<key>;
        # appending the key to REMOTE would repeat the prefix.
        bucket=${REMOTE#s3://}
        bucket=${bucket%%/*}
        # Cutoff day as YYYYMMDD, computed once (GNU date first, BSD date as
        # fallback).
        cutoff=$(date -d "-$RETENTION_DAYS days" +%Y%m%d 2>/dev/null ||
          date -v-"${RETENTION_DAYS}"d +%Y%m%d 2>/dev/null)
        if [ -z "$cutoff" ]; then
          echo " WARN: Could not compute cutoff date, skipping S3 cleanup" >&2
        else
          aws s3 ls "$REMOTE" --recursive 2>/dev/null |
            grep "db_backup_.*\.sql\.gz" |
            while read -r _ _ _ key; do
              # Listing columns are: date, time, size, key (rest of line).
              local stamp day
              stamp=$(printf '%s\n' "${key##*/}" | grep -oE '[0-9]{8}_[0-9]{6}' | head -1)
              [ -n "$stamp" ] || continue
              day=${stamp%%_*}
              if [ "$day" -lt "$cutoff" ]; then
                echo " Deleting: $key"
                aws s3 rm "s3://$bucket/$key" 2>/dev/null ||
                  echo " WARN: Failed to delete s3://$bucket/$key" >&2
              fi
            done
        fi
      fi
      ;;
    oss)
      if command -v ossutil >/dev/null 2>&1; then
        # No time-based deletion is implemented for ossutil; point the
        # operator at lifecycle rules instead.
        echo " INFO: For OSS, configure lifecycle rules in the console for automatic cleanup"
        echo " INFO: Manual cleanup with retention $RETENTION_DAYS days"
      fi
      ;;
    nfs)
      if [ -d "$REMOTE" ]; then
        find "$REMOTE" -name "db_backup_*.sql.gz" -mtime +"$RETENTION_DAYS" -delete 2>/dev/null || true
        echo " Cleaned up files older than $RETENTION_DAYS days"
      fi
      ;;
  esac
  echo " PASS: Cleanup completed"
  return 0
}
# Run the phases in order. A failed upload ends the script with status 1; the
# return values of the verification and cleanup phases are not inspected.
setup_credentials
sync_to_remote || exit 1
verify_sync
cleanup_remote
echo ""
echo "=== Offsite Sync Complete ==="
exit 0

221
scripts/backup-verify.sh Normal file
View File

@@ -0,0 +1,221 @@
#!/bin/bash
# Backup integrity verification script
# Usage: ./backup-verify.sh [backup_file] [--min-size BYTES]
# Verifies the most recent backup when no file argument is given
# Treat references to unset variables as errors.
set -u
# Print the usage text (arguments, options, environment variables and exit
# codes) to stdout. $0 is expanded because the here-document delimiter is
# unquoted.
show_help() {
cat <<EOF
用法: $0 [backup_file] [选项]
备份完整性校验脚本
参数:
backup_file 要校验的备份文件路径(不传时校验最新备份)
选项:
--min-size BYTES 最小文件大小阈值(字节),默认 1024
--no-sql-check 跳过 SQL 语法校验(不连接数据库)
--help, -h 显示帮助信息
环境变量:
BACKUP_DIR 备份目录(默认 ./backups)
DATABASE_URL 数据库连接 URL(用于 SQL 语法校验)
BACKUP_VERIFY_MIN_SIZE 最小文件大小(字节,默认 1024)
退出码:
0 校验通过
1 校验失败
EOF
}
# Parse command-line arguments.
#   BACKUP_FILE   first positional argument (empty: the newest backup is
#                 looked up later)
#   MIN_SIZE      --min-size value, else BACKUP_VERIFY_MIN_SIZE, else 1024
#   NO_SQL_CHECK  1 when --no-sql-check was given
BACKUP_FILE=""
MIN_SIZE="${BACKUP_VERIFY_MIN_SIZE:-1024}"
NO_SQL_CHECK=0
while [ $# -gt 0 ]; do
  case "$1" in
    --help | -h)
      show_help
      exit 0
      ;;
    --min-size)
      if [ $# -lt 2 ]; then
        echo "ERROR: --min-size requires an argument" >&2
        exit 1
      fi
      MIN_SIZE="$2"
      shift 2
      ;;
    --no-sql-check)
      NO_SQL_CHECK=1
      shift
      ;;
    *)
      # Only one positional argument (the backup file) is accepted.
      if [ -z "$BACKUP_FILE" ]; then
        BACKUP_FILE="$1"
      else
        echo "ERROR: Unknown argument: $1" >&2
        exit 1
      fi
      shift
      ;;
  esac
done
# MIN_SIZE is compared numerically with the file size further down. A
# non-numeric threshold would make that test error out and fall through to
# the PASS branch, so reject it here.
case "$MIN_SIZE" in
  '' | *[!0-9]*)
    echo "ERROR: minimum size must be a non-negative integer: $MIN_SIZE" >&2
    exit 1
    ;;
esac
BACKUP_DIR="${BACKUP_DIR:-./backups}"
# No file named on the command line: take the most recently modified backup.
if [ -z "$BACKUP_FILE" ]; then
  BACKUP_FILE=$(ls -t "$BACKUP_DIR"/db_backup_*.sql.gz 2>/dev/null | head -1)
  [ -n "$BACKUP_FILE" ] || {
    echo "ERROR: No backup file found in $BACKUP_DIR" >&2
    echo "Hint: Run scripts/backup-db.sh first or specify a file path" >&2
    exit 1
  }
fi
echo "=== Backup Verification Report ==="
echo "File: $BACKUP_FILE"
echo "Time: $(date -u +"%Y-%m-%dT%H:%M:%SZ")"
echo ""
ERRORS=0
WARNINGS=0
# Step 1: the file must exist.
echo "[1/4] Checking file existence..."
[ -f "$BACKUP_FILE" ] || {
  echo " FAIL: File does not exist: $BACKUP_FILE"
  exit 1
}
echo " PASS: File exists"
# Step 2: the file must reach the minimum size (GNU stat first, BSD stat as
# fallback). An undeterminable size is only a warning.
echo "[2/4] Checking file size..."
FILE_SIZE=$(stat -c%s "$BACKUP_FILE" 2>/dev/null || stat -f%z "$BACKUP_FILE" 2>/dev/null)
if [ -n "$FILE_SIZE" ] && [ "$FILE_SIZE" -lt "$MIN_SIZE" ]; then
  echo " FAIL: File size ${FILE_SIZE} bytes is below threshold ${MIN_SIZE} bytes"
  echo " This may indicate a corrupted or empty backup"
  exit 1
elif [ -z "$FILE_SIZE" ]; then
  echo " WARN: Could not determine file size"
  WARNINGS=$((WARNINGS + 1))
else
  echo " PASS: File size ${FILE_SIZE} bytes (threshold: ${MIN_SIZE} bytes)"
fi
# Step 3: the gzip container must pass its own integrity test.
echo "[3/4] Verifying gzip integrity..."
gunzip -t "$BACKUP_FILE" 2>/dev/null || {
  echo " FAIL: gzip integrity check failed - file may be corrupted"
  exit 1
}
echo " PASS: gzip integrity verified"
# Step 4: decompress the dump and run content heuristics on it.
echo "[4/4] Verifying SQL content..."
# Never fall back to a predictable name in /tmp: the redirect below would
# follow a pre-planted symlink. The second form covers mktemp implementations
# that insist on a template/prefix.
TEMP_SQL=$(mktemp 2>/dev/null || mktemp -t backup_verify 2>/dev/null)
if [ -z "$TEMP_SQL" ]; then
  echo " FAIL: Could not create temporary file"
  exit 1
fi
# Single quotes: paths are expanded (and quoted) when the trap runs, so odd
# characters in the temp path cannot break the cleanup command.
trap 'rm -f -- "$TEMP_SQL" "/tmp/backup_verify_errors_$$.txt" 2>/dev/null' EXIT
if ! gunzip -c "$BACKUP_FILE" > "$TEMP_SQL" 2>/dev/null; then
  echo " FAIL: Could not decompress backup file"
  exit 1
fi
# The decompressed file must not be empty.
SQL_SIZE=$(stat -c%s "$TEMP_SQL" 2>/dev/null || stat -f%z "$TEMP_SQL" 2>/dev/null)
if [ -z "$SQL_SIZE" ] || [ "$SQL_SIZE" -eq 0 ]; then
  echo " FAIL: Decompressed SQL file is empty"
  exit 1
fi
echo " PASS: Decompressed size: ${SQL_SIZE} bytes"
# Look for a mysqldump header.
if grep -q "MySQL dump" "$TEMP_SQL" 2>/dev/null || grep -q "mysqldump" "$TEMP_SQL" 2>/dev/null; then
  echo " PASS: mysqldump header found"
else
  echo " WARN: mysqldump header not found (may not be a standard mysqldump file)"
  WARNINGS=$((WARNINGS + 1))
fi
# Count lines containing ';' as a rough statement count. `grep -c` already
# prints 0 when nothing matches (while exiting 1); adding `|| echo 0` would
# yield "0\n0", break the numeric test and land in the PASS branch.
STMT_COUNT=$(grep -c ";" "$TEMP_SQL" 2>/dev/null || true)
STMT_COUNT=${STMT_COUNT:-0}
if [ "$STMT_COUNT" -lt 10 ]; then
  echo " WARN: Low statement count (${STMT_COUNT} semicolons)"
  WARNINGS=$((WARNINGS + 1))
else
  echo " PASS: Found ${STMT_COUNT} SQL statements"
fi
# Count CREATE TABLE statements (informational only).
CREATE_COUNT=$(grep -ci "CREATE TABLE" "$TEMP_SQL" 2>/dev/null || true)
CREATE_COUNT=${CREATE_COUNT:-0}
echo " INFO: Found ${CREATE_COUNT} CREATE TABLE statements"
# An "ERROR at line" marker anywhere in the dump is treated as a failure.
if grep -qi "ERROR at line" "$TEMP_SQL" 2>/dev/null; then
  echo " FAIL: Found error markers in SQL file"
  exit 1
fi
# Print the host part of a mysql://user:pass@host[:port]/db URL given as $1.
# The host ends at the first ':' or '/', so URLs without an explicit port are
# accepted as well. Prints nothing when the URL has no '@'.
db_url_host() {
  printf '%s\n' "$1" | sed -n 's|.*@\([^:/]*\).*|\1|p'
}
# SQL syntax check (optional, needs DATABASE_URL): load the dump into a
# throw-away database on the server and look for syntax errors.
if [ "$NO_SQL_CHECK" -eq 1 ]; then
  echo " SKIP: SQL syntax check skipped (--no-sql-check)"
elif [ -z "${DATABASE_URL:-}" ]; then
  echo " SKIP: DATABASE_URL not set, skipping SQL syntax check"
else
  echo " Performing SQL syntax check via mysql..."
  # Split DATABASE_URL into its parts; the port is optional (3306 is used
  # below when it is missing).
  DB_USER=$(echo "$DATABASE_URL" | sed -n 's/.*:\/\/\([^:]*\):.*/\1/p')
  DB_PASS=$(echo "$DATABASE_URL" | sed -n 's/.*:\/\/[^:]*:\([^@]*\)@.*/\1/p')
  DB_HOST=$(db_url_host "$DATABASE_URL")
  DB_PORT=$(echo "$DATABASE_URL" | sed -n 's/.*:\([0-9]*\)\/.*/\1/p')
  if [ -z "$DB_HOST" ] || [ -z "$DB_USER" ]; then
    echo " WARN: Could not parse DATABASE_URL, skipping SQL syntax check"
    WARNINGS=$((WARNINGS + 1))
  else
    # Use a uniquely named temporary database so existing schemas are not
    # targeted by the import.
    TEMP_DB="verify_$(date +%s)_$$"
    ERROR_FILE="/tmp/backup_verify_errors_$$.txt"
    if mysql -h "$DB_HOST" -P "${DB_PORT:-3306}" -u "$DB_USER" -p"$DB_PASS" \
      -e "CREATE DATABASE \`$TEMP_DB\`;" 2>"$ERROR_FILE"; then
      # --force keeps going after errors so every syntax error is collected.
      mysql -h "$DB_HOST" -P "${DB_PORT:-3306}" -u "$DB_USER" -p"$DB_PASS" \
        --force "$TEMP_DB" < "$TEMP_SQL" > /dev/null 2>"$ERROR_FILE" || true
      # Only syntax errors count; other execution errors are ignored.
      # grep -c prints the count itself; `|| true` just hides its exit status.
      SYNTAX_ERRORS=$(grep -ci "You have an error in your SQL syntax" "$ERROR_FILE" 2>/dev/null || true)
      SYNTAX_ERRORS=${SYNTAX_ERRORS:-0}
      if [ "$SYNTAX_ERRORS" -gt 0 ]; then
        echo " FAIL: Found $SYNTAX_ERRORS SQL syntax errors"
        grep -i "You have an error in your SQL syntax" "$ERROR_FILE" | head -3
        # Drop the temporary database before bailing out.
        mysql -h "$DB_HOST" -P "${DB_PORT:-3306}" -u "$DB_USER" -p"$DB_PASS" \
          -e "DROP DATABASE IF EXISTS \`$TEMP_DB\`;" 2>/dev/null || true
        exit 1
      else
        echo " PASS: SQL syntax check passed (no syntax errors)"
      fi
      # Drop the temporary database.
      mysql -h "$DB_HOST" -P "${DB_PORT:-3306}" -u "$DB_USER" -p"$DB_PASS" \
        -e "DROP DATABASE IF EXISTS \`$TEMP_DB\`;" 2>/dev/null || true
    else
      echo " WARN: Could not create temp database for syntax check, skipping"
      WARNINGS=$((WARNINGS + 1))
    fi
  fi
fi
# Print the totals and turn them into the exit status.
echo ""
echo "=== Verification Summary ==="
echo "Errors: $ERRORS"
echo "Warnings: $WARNINGS"
if ! [ "$ERRORS" -gt 0 ]; then
  echo "Result: PASSED"
  exit 0
fi
echo "Result: FAILED"
exit 1

420
scripts/dr-drill.ps1 Normal file
View File

@@ -0,0 +1,420 @@
<#
.SYNOPSIS
灾备演练脚本(Windows PowerShell 版本)
.DESCRIPTION
自动化灾备演练:从备份恢复到测试数据库,验证数据完整性
.EXAMPLE
.\dr-drill.ps1
.EXAMPLE
.\dr-drill.ps1 -BackupFile "backups\db_backup_20260617_020000.sql.gz" -TestDb "next_edu_dr_drill"
.EXAMPLE
.\dr-drill.ps1 -NoCleanup
.PARAMETER BackupFile
指定备份文件(不指定则使用最新备份)
.PARAMETER TestDb
测试数据库名(默认 next_edu_dr_drill)
.PARAMETER NoCleanup
演练后不清理测试数据库
.PARAMETER ReportDir
报告输出目录(默认 docs\dr\reports)
.PARAMETER Help
显示帮助信息
#>
[CmdletBinding()]
param(
    [Parameter(Position = 0)]
    [string]$BackupFile = "",
    [Parameter()]
    [string]$TestDb = "",
    [Parameter()]
    [switch]$NoCleanup,
    [Parameter()]
    [string]$ReportDir = "",
    [Parameter()]
    [switch]$Help
)
# -Help: show this script's comment-based help and stop.
if ($Help) {
    Get-Help $MyInvocation.MyCommand.Path -Detailed
    exit 0
}
# Configuration: parameters win, then environment variables, then defaults.
$ErrorActionPreference = "Stop"
$DatabaseUrl = $env:DATABASE_URL
if ([string]::IsNullOrEmpty($DatabaseUrl)) {
    Write-Host "ERROR: DATABASE_URL not set" -ForegroundColor Red
    exit 1
}
$BackupDir = ".\backups"
if ($env:BACKUP_DIR) {
    $BackupDir = $env:BACKUP_DIR
}
if ([string]::IsNullOrEmpty($TestDb)) {
    $TestDb = "next_edu_dr_drill"
    if ($env:DR_DRILL_TEST_DB) {
        $TestDb = $env:DR_DRILL_TEST_DB
    }
}
if ([string]::IsNullOrEmpty($ReportDir)) {
    $ReportDir = "docs\dr\reports"
    if ($env:DR_DRILL_REPORT_DIR) {
        $ReportDir = $env:DR_DRILL_REPORT_DIR
    }
}
# The report file name carries the start time of the drill.
$Timestamp = Get-Date -Format "yyyyMMdd_HHmmss"
$ReportFile = Join-Path $ReportDir "dr_drill_${Timestamp}.md"
# Split a connection URL of the form mysql://user:password@host:port/dbname
# into a hashtable with the keys User, Pass, Host, Port and DbName.
# Port falls back to 3306 when the URL carries none. When the URL cannot be
# parsed, an error is printed and the script exits with code 1.
function Parse-DatabaseUrl {
    param([string]$Url)
    try {
        $uri = [System.Uri]$Url
        # A string without a scheme is accepted by the cast as a relative URI;
        # it has no host or credentials to extract, so treat it as invalid.
        if (-not $uri.IsAbsoluteUri) {
            throw "DATABASE_URL is not an absolute URL"
        }
        $userInfo = $uri.UserInfo -split ':', 2
        # Uri.UserInfo keeps percent-escapes (e.g. %40 for '@'); decode them so
        # mysql receives the actual user name and password.
        $user = [System.Uri]::UnescapeDataString($userInfo[0])
        $pass = ""
        if ($userInfo.Length -gt 1) {
            $pass = [System.Uri]::UnescapeDataString($userInfo[1])
        }
        return @{
            User   = $user
            Pass   = $pass
            Host   = $uri.Host
            Port   = if ($uri.Port -gt 0) { $uri.Port } else { 3306 }
            DbName = $uri.AbsolutePath.TrimStart('/')
        }
    }
    catch {
        Write-Host "ERROR: Invalid DATABASE_URL format" -ForegroundColor Red
        exit 1
    }
}
$db = Parse-DatabaseUrl $DatabaseUrl
# Make sure the report directory exists before the first write.
$reportDirExists = Test-Path $ReportDir
if (-not $reportDirExists) {
    New-Item -ItemType Directory -Path $ReportDir -Force | Out-Null
}
# Create (or overwrite) the Markdown report at $ReportFile with its header:
# title, drill time (UTC), test database, source database, database host and
# backup file, followed by the heading for the step sections.
# Reads the script variables $ReportFile, $TestDb, $db and $BackupFile.
function Init-Report {
    # Labels mirror the report produced by the bash version of this drill.
    # The timestamp is converted to UTC because it carries a literal "Z".
    $content = @"
# 灾备演练报告
- **演练时间**: $((Get-Date).ToUniversalTime().ToString("yyyy-MM-ddTHH:mm:ssZ"))
- **测试数据库**: $TestDb
- **源数据库**: $($db.DbName)
- **数据库主机**: $($db.Host):$($db.Port)
- **备份文件**: $BackupFile
## 演练步骤
"@
    Set-Content -Path $ReportFile -Value $content -Encoding UTF8
}
# Append one entry to the report file ($ReportFile) as UTF-8 text.
function Append-Report {
    param([string]$Content)
    $writeArgs = @{
        Path     = $ReportFile
        Value    = $Content
        Encoding = 'UTF8'
    }
    Add-Content @writeArgs
}
# Append the outcome of one drill step to the report and print a separator
# line to the console.
#   Step   - step label, e.g. "2 - 创建测试数据库"
#   Status - FAILED, WARN, SKIPPED, or anything else (treated as success)
#   Detail - free-form Markdown describing the outcome
function Step-Result {
    param(
        [string]$Step,
        [string]$Status,
        [string]$Detail
    )
    Append-Report "### 步骤 $Step`: $Status"
    Append-Report ""
    Append-Report $Detail
    Append-Report ""
    # Warnings and skipped steps get their own marker so the report does not
    # label them as plain successes.
    switch ($Status) {
        "FAILED"  { Append-Report "❌ 步骤失败" }
        "WARN"    { Append-Report "⚠️ 步骤完成(有警告)" }
        "SKIPPED" { Append-Report "⏭️ 步骤已跳过" }
        default   { Append-Report "✅ 步骤成功" }
    }
    Append-Report ""
    Write-Host "---"
}
# Run one SQL statement through the mysql command-line client using the
# connection settings in $db and return the client's output.
#   Query    - SQL text passed with -e
#   Database - optional default database
#   Silent   - discard stderr instead of merging it into the output
#   Scalar   - add -s -N (no table frame, no column header)
# Throws when mysql exits with a non-zero code, so callers can rely on
# try/catch to detect failed statements.
function Invoke-MySql {
    param(
        [string]$Query,
        [string]$Database = "",
        [switch]$Silent,
        [switch]$Scalar
    )
    $mysqlArgs = @("-h", $db.Host, "-P", $db.Port, "-u", $db.User, "-p$($db.Pass)")
    if (-not [string]::IsNullOrEmpty($Database)) {
        $mysqlArgs += $Database
    }
    $mysqlArgs += @("-e", $Query)
    if ($Scalar) {
        $mysqlArgs += @("-s", "-N")
    }
    # Function-local override: with "Stop" in effect, redirected stderr output
    # of a native command (mysql prints warnings there) can be turned into a
    # terminating error on some PowerShell versions. Success is judged by the
    # exit code below instead.
    $ErrorActionPreference = "Continue"
    if ($Silent) {
        $result = & mysql @mysqlArgs 2>$null
    }
    else {
        $result = & mysql @mysqlArgs 2>&1
    }
    # Native commands report failure through $LASTEXITCODE, not exceptions.
    if ($LASTEXITCODE -ne 0) {
        throw "mysql exited with code $LASTEXITCODE"
    }
    return $result
}
# The mysql client is required for every step; stop early when it is missing.
if (-not (Get-Command mysql -ErrorAction SilentlyContinue)) {
    Write-Host "ERROR: mysql client not found in PATH" -ForegroundColor Red
    Write-Host "Please install MySQL client tools" -ForegroundColor Red
    exit 1
}
Write-Host "=== Disaster Recovery Drill ===" -ForegroundColor Cyan
# The format string ends in a literal "Z", so print UTC rather than local time.
Write-Host "Time: $((Get-Date).ToUniversalTime().ToString('yyyy-MM-ddTHH:mm:ssZ'))"
Write-Host "Test DB: $TestDb"
Write-Host "Source DB: $($db.DbName)@$($db.Host):$($db.Port)"
Write-Host "Report: $ReportFile"
Write-Host ""
Init-Report
$drillStart = Get-Date
$overallStatus = "SUCCESS"
# Step 1: locate the backup file (newest db_backup_*.sql.gz in $BackupDir
# unless one was passed in).
Write-Host "[1/6] Locating backup file..."
if ([string]::IsNullOrEmpty($BackupFile)) {
    $newestBackup = Get-ChildItem -Path (Join-Path $BackupDir "db_backup_*.sql.gz") -ErrorAction SilentlyContinue |
        Sort-Object LastWriteTime -Descending |
        Select-Object -First 1
    if (-not $newestBackup) {
        Write-Host " FAIL: No backup file found in $BackupDir" -ForegroundColor Red
        Step-Result "1 - 定位备份文件" "FAILED" "未找到备份文件于 $BackupDir"
        Append-Report "## 演练结果: ❌ FAILED`n`n演练失败,未找到备份文件"
        exit 1
    }
    $BackupFile = $newestBackup.FullName
}
$backupExists = Test-Path $BackupFile
if (-not $backupExists) {
    Write-Host " FAIL: Backup file not found: $BackupFile" -ForegroundColor Red
    Step-Result "1 - 定位备份文件" "FAILED" "备份文件不存在: $BackupFile"
    exit 1
}
$backupSize = (Get-Item $BackupFile).Length
Write-Host " PASS: Found backup: $BackupFile ($backupSize bytes)" -ForegroundColor Green
Step-Result "1 - 定位备份文件" "PASSED" "备份文件: ``$BackupFile`` ($backupSize bytes)"
# Step 2: (re)create the empty test database.
Write-Host "[2/6] Creating test database..."
try {
    Invoke-MySql -Query "DROP DATABASE IF EXISTS ``$TestDb``;" -Silent
    Invoke-MySql -Query "CREATE DATABASE ``$TestDb`` CHARACTER SET utf8mb4 COLLATE utf8mb4_unicode_ci;" -Silent
    # By default a failing native command only sets $LASTEXITCODE; turn a
    # failed CREATE into an exception so the catch block below is reached.
    if ($LASTEXITCODE -ne 0) {
        throw "mysql exited with code $LASTEXITCODE"
    }
    Write-Host " PASS: Test database created: $TestDb" -ForegroundColor Green
    Step-Result "2 - 创建测试数据库" "PASSED" "测试数据库 ``$TestDb`` 创建成功"
}
catch {
    Write-Host " FAIL: Could not create test database" -ForegroundColor Red
    Step-Result "2 - 创建测试数据库" "FAILED" "创建测试数据库 ``$TestDb`` 失败"
    $overallStatus = "FAILED"
    Append-Report "## 演练结果: ❌ FAILED"
    exit 1
}
# Step 3: decompress the backup to a temporary file and load it into the test
# database. Any failing stage (decompression or mysql) fails the drill.
Write-Host "[3/6] Restoring backup to test database..."
$restoreStart = Get-Date
$tempSqlFile = $null
try {
    $tempSqlFile = [System.IO.Path]::GetTempFileName()
    if (Get-Command gunzip -ErrorAction SilentlyContinue) {
        # gunzip writes straight into the temp file, so bytes are preserved.
        $process = Start-Process -FilePath "gunzip" -ArgumentList "-c", "`"$BackupFile`"" -NoNewWindow -RedirectStandardOutput $tempSqlFile -Wait -PassThru
        if ($process.ExitCode -ne 0) {
            throw "gunzip exited with code $($process.ExitCode)"
        }
    }
    else {
        # Built-in fallback: stream-copy through GZipStream. This is a byte
        # copy, so it neither re-encodes the dump (piping 7z output through
        # Out-File -Encoding ASCII replaced every non-ASCII character) nor
        # loads the whole dump into memory.
        # .NET resolves relative paths against the process directory, not the
        # PowerShell location, hence the explicit Resolve-Path.
        $backupPath = (Resolve-Path -LiteralPath $BackupFile).Path
        $inStream = [System.IO.File]::OpenRead($backupPath)
        try {
            $gzStream = New-Object System.IO.Compression.GZipStream($inStream, [System.IO.Compression.CompressionMode]::Decompress)
            $outStream = [System.IO.File]::Create($tempSqlFile)
            try {
                $gzStream.CopyTo($outStream)
            }
            finally {
                $outStream.Dispose()
                $gzStream.Dispose()
            }
        }
        finally {
            $inStream.Dispose()
        }
    }
    # Run the restore. Read and pipe the dump as UTF-8 explicitly: Windows
    # PowerShell otherwise reads BOM-less files with the ANSI code page and
    # sends text to native commands as ASCII.
    # NOTE(review): assumes the dump is UTF-8 encoded - confirm against the
    # backup script.
    $OutputEncoding = New-Object System.Text.UTF8Encoding $false
    $mysqlArgs = @("-h", $db.Host, "-P", $db.Port, "-u", $db.User, "-p$($db.Pass)", $TestDb)
    Get-Content $tempSqlFile -Raw -Encoding UTF8 | & mysql @mysqlArgs 2>$null
    # Native commands signal failure through the exit code, not exceptions.
    if ($LASTEXITCODE -ne 0) {
        throw "mysql exited with code $LASTEXITCODE"
    }
    Remove-Item $tempSqlFile -Force -ErrorAction SilentlyContinue
    $restoreEnd = Get-Date
    $restoreDuration = ($restoreEnd - $restoreStart).TotalSeconds
    Write-Host " PASS: Restore completed in $([int]$restoreDuration)s" -ForegroundColor Green
    Step-Result "3 - 从备份恢复" "PASSED" "恢复完成,耗时 $([int]$restoreDuration)"
}
catch {
    Write-Host " FAIL: Restore failed: $_" -ForegroundColor Red
    Step-Result "3 - 从备份恢复" "FAILED" "从备份恢复失败: $_"
    $overallStatus = "FAILED"
    # Do not leave the decompressed dump behind on failure.
    if ($tempSqlFile) {
        Remove-Item $tempSqlFile -Force -ErrorAction SilentlyContinue
    }
    if (-not $NoCleanup) {
        try { Invoke-MySql -Query "DROP DATABASE IF EXISTS ``$TestDb``;" -Silent } catch {}
    }
    Append-Report "## 演练结果: ❌ FAILED"
    exit 1
}
# Step 4: compare table and (approximate) row counts of the restored test
# database with the source database, using information_schema.
Write-Host "[4/6] Running data integrity checks..."
$testTables = Invoke-MySql -Query "SELECT COUNT(*) FROM information_schema.tables WHERE table_schema='$TestDb';" -Silent -Scalar
$sourceTables = Invoke-MySql -Query "SELECT COUNT(*) FROM information_schema.tables WHERE table_schema='$($db.DbName)';" -Silent -Scalar
Write-Host " Test DB tables: $testTables"
Write-Host " Source DB tables: $sourceTables"
$testRecords = Invoke-MySql -Query "SELECT SUM(table_rows) FROM information_schema.tables WHERE table_schema='$TestDb';" -Silent -Scalar
$sourceRecords = Invoke-MySql -Query "SELECT SUM(table_rows) FROM information_schema.tables WHERE table_schema='$($db.DbName)';" -Silent -Scalar
Write-Host " Test DB records: $testRecords"
Write-Host " Source DB records: $sourceRecords"
# Markdown table for the report; headers mirror the bash version of the drill.
$integrityDetail = @"
| 指标 | 测试库 | 源库 |
|------|--------|------|
| 表数量 | $testTables | $sourceTables |
| 记录数(近似) | $testRecords | $sourceRecords |
"@
if ([int]$testTables -ge [int]$sourceTables) {
    Write-Host " PASS: Table count matches" -ForegroundColor Green
    Step-Result "4 - 数据完整性检查" "PASSED" $integrityDetail
}
else {
    Write-Host " WARN: Test DB has fewer tables than source" -ForegroundColor Yellow
    Step-Result "4 - 数据完整性检查" "WARN" "$integrityDetail`n`n⚠️ 测试库表数量少于源库"
}
# Step 5: smoke tests against the restored database. Missing users/schools
# tables are only warnings; a database without any base table is a failure.
Write-Host "[5/6] Running smoke tests..."
$smokePassed = 0
$smokeFailed = 0
$smokeDetail = ""
# Test 1: users table.
try {
    $userCount = Invoke-MySql -Query "SELECT COUNT(*) FROM users;" -Database $TestDb -Silent -Scalar
    # A failed native command only sets $LASTEXITCODE by default; raise so the
    # catch block handles a missing table.
    if ($LASTEXITCODE -ne 0) { throw "mysql exited with code $LASTEXITCODE" }
    $smokePassed++
    $smokeDetail += "- ✅ users 表查询成功: $userCount 条记录`n"
    Write-Host " PASS: users table query: $userCount records" -ForegroundColor Green
}
catch {
    $smokeDetail += "- ⚠️ users 表不存在或查询失败`n"
    Write-Host " WARN: users table not found or query failed" -ForegroundColor Yellow
}
# Test 2: schools table.
try {
    $schoolCount = Invoke-MySql -Query "SELECT COUNT(*) FROM schools;" -Database $TestDb -Silent -Scalar
    if ($LASTEXITCODE -ne 0) { throw "mysql exited with code $LASTEXITCODE" }
    $smokePassed++
    $smokeDetail += "- ✅ schools 表查询成功: $schoolCount 条记录`n"
    Write-Host " PASS: schools table query: $schoolCount records" -ForegroundColor Green
}
catch {
    $smokeDetail += "- ⚠️ schools 表不存在或查询失败`n"
    Write-Host " WARN: schools table not found or query failed" -ForegroundColor Yellow
}
# Test 3: the restored schema must contain at least one base table.
try {
    $baseTableCount = Invoke-MySql -Query "SELECT COUNT(*) FROM information_schema.tables WHERE table_schema='$TestDb' AND table_type='BASE TABLE';" -Silent -Scalar
    if ($LASTEXITCODE -ne 0) { throw "mysql exited with code $LASTEXITCODE" }
    if ([int]$baseTableCount -gt 0) {
        $smokePassed++
        $smokeDetail += "- ✅ 基础表查询成功: $baseTableCount 个基础表`n"
        Write-Host " PASS: Base table query: $baseTableCount tables" -ForegroundColor Green
    }
    else {
        $smokeFailed++
        $smokeDetail += "- ❌ 基础表查询失败`n"
        Write-Host " FAIL: Base table query failed" -ForegroundColor Red
    }
}
catch {
    $smokeFailed++
    $smokeDetail += "- ❌ 基础表查询失败`n"
    Write-Host " FAIL: Base table query failed" -ForegroundColor Red
}
# Report the step according to the counters and carry a failure over into the
# overall drill status instead of always recording PASSED.
$smokeStatus = "PASSED"
if ($smokeFailed -gt 0) {
    $smokeStatus = "FAILED"
    $overallStatus = "FAILED"
}
Step-Result "5 - 冒烟测试" $smokeStatus "通过: $smokePassed, 失败: $smokeFailed`n`n$smokeDetail"
# Step 6: drop the test database unless -NoCleanup was given.
Write-Host "[6/6] Cleaning up test database..."
if (-not $NoCleanup) {
    try {
        Invoke-MySql -Query "DROP DATABASE IF EXISTS ``$TestDb``;" -Silent
        Write-Host " PASS: Test database dropped: $TestDb" -ForegroundColor Green
        Step-Result "6 - 清理测试数据库" "PASSED" "测试数据库 ``$TestDb`` 已删除"
    }
    catch {
        Write-Host " WARN: Could not drop test database (manual cleanup required)" -ForegroundColor Yellow
        Step-Result "6 - 清理测试数据库" "WARN" "⚠️ 无法删除测试数据库 ``$TestDb``,需手动清理"
    }
}
else {
    Write-Host " SKIP: Cleanup skipped (--NoCleanup)" -ForegroundColor Yellow
    Step-Result "6 - 清理测试数据库" "SKIPPED" "演练后保留测试数据库 ``$TestDb``"
}
# Write the summary and the RTO/RPO assessment to the report, echo a short
# summary to the console and exit 0 on success, 1 otherwise.
$drillEnd = Get-Date
$drillDuration = ($drillEnd - $drillStart).TotalSeconds
$succeeded = ($overallStatus -eq "SUCCESS")
$statusLine = "**状态**: ❌ 失败"
if ($succeeded) {
    $statusLine = "**状态**: ✅ 成功"
}
# The restore counts as on target when it stayed under 14400 seconds (4 hours).
$rtoLine = "- **RTO 评估**: ⚠️ 需关注"
if ($restoreDuration -lt 14400) {
    $rtoLine = "- **RTO 评估**: ✅ 达标"
}
$summaryLines = @(
    "## 演练结果",
    "",
    $statusLine,
    "**总耗时**: $([int]$drillDuration)",
    "**备份文件**: ``$BackupFile``",
    "**测试数据库**: ``$TestDb``",
    "",
    "## RTO/RPO 评估",
    "",
    "- **RTO 目标**: 4 小时",
    "- **本次恢复耗时**: $([int]$restoreDuration) 秒 ($([int]($restoreDuration / 60)) 分钟)",
    $rtoLine,
    "- **RPO 目标**: 24 小时(取决于备份频率)",
    ""
)
foreach ($summaryLine in $summaryLines) {
    Append-Report $summaryLine
}
Write-Host ""
Write-Host "=== Drill Summary ===" -ForegroundColor Cyan
Write-Host "Status: $overallStatus"
Write-Host "Duration: $([int]$drillDuration)s"
Write-Host "Report: $ReportFile"
Write-Host ""
if (-not $succeeded) {
    exit 1
}
exit 0

369
scripts/dr-drill.sh Normal file
View File

@@ -0,0 +1,369 @@
#!/bin/bash
# Disaster recovery drill script
# Usage: ./dr-drill.sh
# Automated DR drill: restores a backup into a test database and verifies data integrity
# Treat references to unset variables as errors.
set -u
# Print the usage text (options, environment variables and exit codes) to
# stdout. $0 is expanded because the here-document delimiter is unquoted.
show_help() {
cat <<EOF
用法: $0 [选项]
灾备演练脚本,自动化测试备份恢复流程
选项:
--backup FILE 指定备份文件(不指定则使用最新备份)
--test-db NAME 测试数据库名(默认 next_edu_dr_drill)
--no-cleanup 演练后不清理测试数据库
--report-dir DIR 报告输出目录(默认 docs/dr/reports)
--help, -h 显示帮助信息
环境变量:
DATABASE_URL 数据库连接 URL(必需)
BACKUP_DIR 备份目录(默认 ./backups)
DR_DRILL_TEST_DB 测试数据库名(默认 next_edu_dr_drill)
DR_DRILL_REPORT_DIR 报告目录(默认 docs/dr/reports)
退出码:
0 演练成功
1 演练失败
EOF
}
# Parse command-line options. Options that take a value need a following
# word; any unrecognised word is an error.
REPORT_DIR=""
NO_CLEANUP=0
BACKUP_FILE=""
until [ $# -eq 0 ]; do
  case "$1" in
    --help | -h)
      show_help
      exit 0
      ;;
    --no-cleanup)
      NO_CLEANUP=1
      shift
      ;;
    --backup)
      [ $# -ge 2 ] || {
        echo "ERROR: --backup requires an argument" >&2
        exit 1
      }
      BACKUP_FILE="$2"
      shift 2
      ;;
    --test-db)
      [ $# -ge 2 ] || {
        echo "ERROR: --test-db requires an argument" >&2
        exit 1
      }
      # Stored in the environment-style variable the configuration reads.
      DR_DRILL_TEST_DB="$2"
      shift 2
      ;;
    --report-dir)
      [ $# -ge 2 ] || {
        echo "ERROR: --report-dir requires an argument" >&2
        exit 1
      }
      REPORT_DIR="$2"
      shift 2
      ;;
    *)
      echo "ERROR: Unknown argument: $1" >&2
      exit 1
      ;;
  esac
done
# Configuration: command-line values first, then environment, then defaults.
DATABASE_URL="${DATABASE_URL:-}"
BACKUP_DIR="${BACKUP_DIR:-./backups}"
TEST_DB="${DR_DRILL_TEST_DB:-next_edu_dr_drill}"
REPORT_DIR="${REPORT_DIR:-${DR_DRILL_REPORT_DIR:-docs/dr/reports}}"
TIMESTAMP=$(date +"%Y%m%d_%H%M%S")
REPORT_FILE="$REPORT_DIR/dr_drill_${TIMESTAMP}.md"
# DATABASE_URL is mandatory.
if [ -z "$DATABASE_URL" ]; then
  echo "ERROR: DATABASE_URL not set" >&2
  exit 1
fi
# Split DATABASE_URL (mysql://user:password@host[:port]/dbname) into parts.
DB_USER=$(echo "$DATABASE_URL" | sed -n 's/.*:\/\/\([^:]*\):.*/\1/p')
DB_PASS=$(echo "$DATABASE_URL" | sed -n 's/.*:\/\/[^:]*:\([^@]*\)@.*/\1/p')
# The host ends at the first ':' or '/', so a URL without a port still parses.
DB_HOST=$(echo "$DATABASE_URL" | sed -n 's|.*@\([^:/]*\).*|\1|p')
DB_PORT=$(echo "$DATABASE_URL" | sed -n 's/.*:\([0-9]*\)\/.*/\1/p')
# Fall back to the standard MySQL port when the URL does not carry one.
DB_PORT="${DB_PORT:-3306}"
DB_NAME=$(echo "$DATABASE_URL" | sed -n 's/.*\/\([^?]*\).*/\1/p')
# Without a user and host no mysql call below can work; fail with a clear
# message instead of a series of connection errors.
if [ -z "$DB_HOST" ] || [ -z "$DB_USER" ]; then
  echo "ERROR: Could not parse DATABASE_URL" >&2
  exit 1
fi
# Make sure the report directory exists.
mkdir -p "$REPORT_DIR"
# Create (or truncate) REPORT_FILE and write the report header: title, drill
# time in UTC, test database, source database, host:port and backup file,
# followed by the heading for the step sections.
# Globals read: REPORT_FILE, TEST_DB, DB_NAME, DB_HOST, DB_PORT, BACKUP_FILE.
init_report() {
  local started_at
  started_at=$(date -u +"%Y-%m-%dT%H:%M:%SZ")
  {
    printf '%s\n' "# 灾备演练报告"
    printf '%s\n' "- **演练时间**: $started_at"
    printf '%s\n' "- **测试数据库**: $TEST_DB"
    printf '%s\n' "- **源数据库**: $DB_NAME"
    printf '%s\n' "- **数据库主机**: $DB_HOST:$DB_PORT"
    printf '%s\n' "- **备份文件**: $BACKUP_FILE"
    printf '%s\n' "## 演练步骤"
  } > "$REPORT_FILE"
}
# Append $1 as one line to REPORT_FILE.
# printf is used instead of echo so that a line consisting of "-n", "-e" or
# "-E" is written literally rather than being taken as an echo option.
append_report() {
  printf '%s\n' "$1" >> "$REPORT_FILE"
}
# Append the outcome of one drill step to the report and print a separator
# line to stdout.
#   $1 - step label, e.g. "2 - 创建测试数据库"
#   $2 - status: FAILED, WARN, SKIPPED, or anything else (treated as success)
#   $3 - free-form Markdown describing the outcome
step_result() {
  local step="$1"
  local status="$2"
  local detail="$3"
  append_report "### 步骤 $step: $status"
  append_report ""
  append_report "$detail"
  append_report ""
  # Warnings and skipped steps get their own marker so the report does not
  # label them as plain successes.
  case "$status" in
    FAILED) append_report "❌ 步骤失败" ;;
    WARN) append_report "⚠️ 步骤完成(有警告)" ;;
    SKIPPED) append_report "⏭️ 步骤已跳过" ;;
    *) append_report "✅ 步骤成功" ;;
  esac
  append_report ""
  echo "---"
}
# Banner, report header and drill bookkeeping.
echo "=== Disaster Recovery Drill ==="
echo "Time: $(date -u +"%Y-%m-%dT%H:%M:%SZ")"
echo "Test DB: $TEST_DB"
echo "Source DB: $DB_NAME@$DB_HOST:$DB_PORT"
echo "Report: $REPORT_FILE"
echo ""
init_report
DRILL_START=$(date +%s)
OVERALL_STATUS="SUCCESS"
# Step 1: locate the backup file (newest db_backup_*.sql.gz unless --backup
# was given).
echo "[1/6] Locating backup file..."
if [ -z "$BACKUP_FILE" ]; then
  BACKUP_FILE=$(ls -t "$BACKUP_DIR"/db_backup_*.sql.gz 2>/dev/null | head -1)
  [ -n "$BACKUP_FILE" ] || {
    echo " FAIL: No backup file found in $BACKUP_DIR"
    step_result "1 - 定位备份文件" "FAILED" "未找到备份文件于 $BACKUP_DIR"
    OVERALL_STATUS="FAILED"
    append_report "## 演练结果: ❌ FAILED"
    append_report ""
    append_report "演练失败,未找到备份文件"
    exit 1
  }
fi
[ -f "$BACKUP_FILE" ] || {
  echo " FAIL: Backup file not found: $BACKUP_FILE"
  step_result "1 - 定位备份文件" "FAILED" "备份文件不存在: $BACKUP_FILE"
  OVERALL_STATUS="FAILED"
  append_report "## 演练结果: ❌ FAILED"
  exit 1
}
BACKUP_SIZE=$(stat -c%s "$BACKUP_FILE" 2>/dev/null || stat -f%z "$BACKUP_FILE" 2>/dev/null)
echo " PASS: Found backup: $BACKUP_FILE (${BACKUP_SIZE} bytes)"
step_result "1 - 定位备份文件" "PASSED" "备份文件: \`$BACKUP_FILE\` (${BACKUP_SIZE} bytes)"
# Step 2: (re)create the empty test database. A left-over database from an
# earlier drill is dropped first; the result of that drop is not checked.
echo "[2/6] Creating test database..."
mysql -h "$DB_HOST" -P "$DB_PORT" -u "$DB_USER" -p"$DB_PASS" \
  -e "DROP DATABASE IF EXISTS \`$TEST_DB\`;" 2>/dev/null
if ! mysql -h "$DB_HOST" -P "$DB_PORT" -u "$DB_USER" -p"$DB_PASS" \
  -e "CREATE DATABASE \`$TEST_DB\` CHARACTER SET utf8mb4 COLLATE utf8mb4_unicode_ci;" 2>/dev/null; then
  echo " FAIL: Could not create test database"
  step_result "2 - 创建测试数据库" "FAILED" "创建测试数据库 \`$TEST_DB\` 失败"
  OVERALL_STATUS="FAILED"
  append_report "## 演练结果: ❌ FAILED"
  exit 1
fi
echo " PASS: Test database created: $TEST_DB"
step_result "2 - 创建测试数据库" "PASSED" "测试数据库 \`$TEST_DB\` 创建成功"
# Step 3: restore the backup into the test database.
echo "[3/6] Restoring backup to test database..."
RESTORE_START=$(date +%s)
# pipefail (enabled only inside this subshell) makes the pipeline fail when
# gunzip fails. Without it the status is mysql's alone, and a corrupt or
# truncated archive could still be reported as a successful restore.
if (
  set -o pipefail
  gunzip -c "$BACKUP_FILE" | mysql -h "$DB_HOST" -P "$DB_PORT" -u "$DB_USER" -p"$DB_PASS" "$TEST_DB" 2>/dev/null
); then
  RESTORE_END=$(date +%s)
  RESTORE_DURATION=$((RESTORE_END - RESTORE_START))
  echo " PASS: Restore completed in ${RESTORE_DURATION}s"
  step_result "3 - 从备份恢复" "PASSED" "恢复完成,耗时 ${RESTORE_DURATION}"
else
  echo " FAIL: Restore failed"
  step_result "3 - 从备份恢复" "FAILED" "从备份恢复失败"
  OVERALL_STATUS="FAILED"
  # Best-effort cleanup of the partially restored database.
  if [ "$NO_CLEANUP" -eq 0 ]; then
    mysql -h "$DB_HOST" -P "$DB_PORT" -u "$DB_USER" -p"$DB_PASS" \
      -e "DROP DATABASE IF EXISTS \`$TEST_DB\`;" 2>/dev/null || true
  fi
  append_report "## 演练结果: ❌ FAILED"
  exit 1
fi
# Step 4: compare table and (approximate) row counts of the restored test
# database with the source database, using information_schema.
echo "[4/6] Running data integrity checks..."
# Run the scalar query in $1 and print its bare result; prints 0 when the
# mysql client fails.
dr_scalar_query() {
  mysql -h "$DB_HOST" -P "$DB_PORT" -u "$DB_USER" -p"$DB_PASS" \
    -e "$1" \
    -s -N 2>/dev/null || echo 0
}
# Table counts.
TEST_TABLES=$(dr_scalar_query "SELECT COUNT(*) FROM information_schema.tables WHERE table_schema='$TEST_DB';")
SOURCE_TABLES=$(dr_scalar_query "SELECT COUNT(*) FROM information_schema.tables WHERE table_schema='$DB_NAME';")
echo " Test DB tables: $TEST_TABLES"
echo " Source DB tables: $SOURCE_TABLES"
# Row totals as reported by information_schema.
TEST_RECORDS=$(dr_scalar_query "SELECT SUM(table_rows) FROM information_schema.tables WHERE table_schema='$TEST_DB';")
SOURCE_RECORDS=$(dr_scalar_query "SELECT SUM(table_rows) FROM information_schema.tables WHERE table_schema='$DB_NAME';")
echo " Test DB records: $TEST_RECORDS"
echo " Source DB records: $SOURCE_RECORDS"
# Markdown table for the report.
INTEGRITY_DETAIL="| 指标 | 测试库 | 源库 |
|------|--------|------|
| 表数量 | $TEST_TABLES | $SOURCE_TABLES |
| 记录数(近似) | $TEST_RECORDS | $SOURCE_RECORDS |"
if [ "$TEST_TABLES" -ge "$SOURCE_TABLES" ]; then
  echo " PASS: Table count matches"
  step_result "4 - 数据完整性检查" "PASSED" "$INTEGRITY_DETAIL"
else
  echo " WARN: Test DB has fewer tables than source"
  step_result "4 - 数据完整性检查" "WARN" "$INTEGRITY_DETAIL
⚠️ 测试库表数量少于源库"
fi
# Step 5: smoke tests against the restored database.
# Missing users/schools tables are treated as warnings; a restored database
# with no base tables at all is a failure. The step status now reflects
# SMOKE_FAILED (it used to be reported as PASSED unconditionally) and a smoke
# failure marks the whole drill as FAILED.
echo "[5/6] Running smoke tests..."
SMOKE_PASSED=0
SMOKE_FAILED=0
SMOKE_DETAIL=""

# Test 1: the users table can be queried (optional table).
USER_COUNT=$(mysql -h "$DB_HOST" -P "$DB_PORT" -u "$DB_USER" -p"$DB_PASS" "$TEST_DB" \
  -e "SELECT COUNT(*) FROM users;" -s -N 2>/dev/null || echo "N/A")
if [ "$USER_COUNT" != "N/A" ]; then
  SMOKE_PASSED=$((SMOKE_PASSED + 1))
  SMOKE_DETAIL="${SMOKE_DETAIL}- ✅ users 表查询成功: ${USER_COUNT} 条记录
"
  echo " PASS: users table query: $USER_COUNT records"
else
  SMOKE_DETAIL="${SMOKE_DETAIL}- ⚠️ users 表不存在或查询失败
"
  echo " WARN: users table not found or query failed"
fi

# Test 2: the schools table can be queried (optional table).
SCHOOL_COUNT=$(mysql -h "$DB_HOST" -P "$DB_PORT" -u "$DB_USER" -p"$DB_PASS" "$TEST_DB" \
  -e "SELECT COUNT(*) FROM schools;" -s -N 2>/dev/null || echo "N/A")
if [ "$SCHOOL_COUNT" != "N/A" ]; then
  SMOKE_PASSED=$((SMOKE_PASSED + 1))
  SMOKE_DETAIL="${SMOKE_DETAIL}- ✅ schools 表查询成功: ${SCHOOL_COUNT} 条记录
"
  echo " PASS: schools table query: $SCHOOL_COUNT records"
else
  SMOKE_DETAIL="${SMOKE_DETAIL}- ⚠️ schools 表不存在或查询失败
"
  echo " WARN: schools table not found or query failed"
fi

# Test 3: the restored schema contains at least one base table. (The query
# counts information_schema rows; it does not perform a JOIN.)
JOIN_TEST=$(mysql -h "$DB_HOST" -P "$DB_PORT" -u "$DB_USER" -p"$DB_PASS" "$TEST_DB" \
  -e "SELECT COUNT(*) FROM information_schema.tables WHERE table_schema='$TEST_DB' AND table_type='BASE TABLE';" \
  -s -N 2>/dev/null || echo "0")
# Normalise empty or non-numeric output so the numeric test below cannot error.
case "$JOIN_TEST" in
  ''|*[!0-9]*) JOIN_TEST=0 ;;
esac
if [ "$JOIN_TEST" -gt 0 ]; then
  SMOKE_PASSED=$((SMOKE_PASSED + 1))
  SMOKE_DETAIL="${SMOKE_DETAIL}- ✅ 基础表查询成功: ${JOIN_TEST} 个基础表
"
  echo " PASS: Base table query: $JOIN_TEST tables"
else
  SMOKE_DETAIL="${SMOKE_DETAIL}- ❌ 基础表查询失败
"
  SMOKE_FAILED=$((SMOKE_FAILED + 1))
  echo " FAIL: Base table query failed"
fi

# Derive the step status from the failure counter instead of hard-coding it.
if [ "$SMOKE_FAILED" -gt 0 ]; then
  SMOKE_STATUS="FAILED"
  OVERALL_STATUS="FAILED"
else
  SMOKE_STATUS="PASSED"
fi
step_result "5 - 冒烟测试" "$SMOKE_STATUS" "通过: $SMOKE_PASSED, 失败: $SMOKE_FAILED
$SMOKE_DETAIL"
# Step 6: drop the test database unless the operator asked to keep it.
echo "[6/6] Cleaning up test database..."
if ! [ "$NO_CLEANUP" -eq 1 ]; then
  if mysql -h "$DB_HOST" -P "$DB_PORT" -u "$DB_USER" -p"$DB_PASS" \
    -e "DROP DATABASE IF EXISTS \`$TEST_DB\`;" 2>/dev/null; then
    echo " PASS: Test database dropped: $TEST_DB"
    step_result "6 - 清理测试数据库" "PASSED" "测试数据库 \`$TEST_DB\` 已删除"
  else
    # A failed drop is reported but does not change the drill outcome.
    echo " WARN: Could not drop test database (manual cleanup required)"
    step_result "6 - 清理测试数据库" "WARN" "⚠️ 无法删除测试数据库 \`$TEST_DB\`,需手动清理"
  fi
else
  echo " SKIP: Cleanup skipped (--no-cleanup)"
  step_result "6 - 清理测试数据库" "SKIPPED" "演练后保留测试数据库 \`$TEST_DB\`"
fi
# Final summary: append the overall result and the RTO/RPO assessment to the
# report, print a console summary, and exit 0 only when the drill succeeded.
DRILL_END=$(date +%s)
DRILL_DURATION=$((DRILL_END - DRILL_START))
# RTO target stated in the report text below: 4 hours, expressed in seconds.
RTO_TARGET_SECONDS=14400

append_report "## 演练结果"
append_report ""
if [ "$OVERALL_STATUS" = "SUCCESS" ]; then
  append_report "**状态**: ✅ 成功"
else
  append_report "**状态**: ❌ 失败"
fi
# The total duration now carries a unit, like the restore duration below.
append_report "**总耗时**: ${DRILL_DURATION} 秒"
append_report "**备份文件**: \`$BACKUP_FILE\`"
append_report "**测试数据库**: \`$TEST_DB\`"
append_report ""
append_report "## RTO/RPO 评估"
append_report ""
append_report "- **RTO 目标**: 4 小时"
# Check that RESTORE_DURATION is set before dereferencing it; the previous
# order expanded it first, which would abort under `set -u` if it were unset.
if [ -n "${RESTORE_DURATION:-}" ]; then
  append_report "- **本次恢复耗时**: ${RESTORE_DURATION} 秒 ($(( RESTORE_DURATION / 60 )) 分钟)"
else
  append_report "- **本次恢复耗时**: N/A"
fi
if [ -n "${RESTORE_DURATION:-}" ] && [ "$RESTORE_DURATION" -lt "$RTO_TARGET_SECONDS" ]; then
  append_report "- **RTO 评估**: ✅ 达标"
else
  append_report "- **RTO 评估**: ⚠️ 需关注"
fi
append_report "- **RPO 目标**: 24 小时(取决于备份频率)"
append_report ""

echo ""
echo "=== Drill Summary ==="
echo "Status: $OVERALL_STATUS"
echo "Duration: ${DRILL_DURATION}s"
echo "Report: $REPORT_FILE"
echo ""
if [ "$OVERALL_STATUS" = "SUCCESS" ]; then
  exit 0
else
  exit 1
fi

419
scripts/failover.sh Normal file
View File

@@ -0,0 +1,419 @@
#!/bin/bash
# 故障切换脚本
# 用法: ./failover.sh [--auto] [--primary URL] [--standby URL]
# 用于主数据库故障时切换到备库
set -u
# Print the usage text for the failover script to stdout.
# The first line embeds the script name ($0); every other line is fixed text.
show_help() {
  printf '%s\n' \
    "用法: $0 [选项]" \
    '数据库故障切换脚本,将应用从主库切换到备库' \
    '选项:' \
    '--auto 半自动模式(检测失败后自动切换,需先确认)' \
    '--primary URL 主库连接 URL(默认从 DATABASE_URL 读取)' \
    '--standby URL 备库连接 URL(必需,从 DATABASE_URL_STANDBY 读取)' \
    '--app-url URL 应用健康检查 URL(默认 http://localhost:8015)' \
    '--no-restart 不重启应用(仅更新配置)' \
    '--dry-run 演练模式,只输出步骤不实际执行' \
    '--help, -h 显示帮助信息' \
    '环境变量:' \
    'DATABASE_URL 主库连接 URL' \
    'DATABASE_URL_STANDBY 备库连接 URL(必需)' \
    'FAILOVER_APP_URL 应用健康检查 URL(默认 http://localhost:8015)' \
    'FAILOVER_APP_NAME 应用容器名(默认 nextjs-app)' \
    'FAILOVER_CONFIG_FILE 配置文件路径(默认 .env.local)' \
    'FAILOVER_LOG_FILE 切换日志路径(默认 docs/dr/logs/failover.log)' \
    '退出码:' \
    '0 切换成功' \
    '1 切换失败'
}
# Command-line parsing. Every option starts from its "unset" value; the
# environment fallbacks are applied afterwards, so flags take precedence.
AUTO_MODE=0
PRIMARY_URL=""
STANDBY_URL=""
APP_URL=""
NO_RESTART=0
DRY_RUN=0
while [ $# -gt 0 ]; do
  case "$1" in
    --primary|--standby|--app-url)
      # Options that take a value share one arity check; "$1" reproduces the
      # offending flag in the error message.
      if [ $# -lt 2 ]; then
        echo "ERROR: $1 requires an argument" >&2
        exit 1
      fi
      case "$1" in
        --primary) PRIMARY_URL="$2" ;;
        --standby) STANDBY_URL="$2" ;;
        --app-url) APP_URL="$2" ;;
      esac
      shift 2
      ;;
    --auto)
      AUTO_MODE=1
      shift
      ;;
    --no-restart)
      NO_RESTART=1
      shift
      ;;
    --dry-run)
      DRY_RUN=1
      shift
      ;;
    --help|-h)
      show_help
      exit 0
      ;;
    *)
      echo "ERROR: Unknown argument: $1" >&2
      exit 1
      ;;
  esac
done
# Effective configuration: command-line values win, then environment
# variables, then built-in defaults.
PRIMARY_URL="${PRIMARY_URL:-${DATABASE_URL:-}}"
STANDBY_URL="${STANDBY_URL:-${DATABASE_URL_STANDBY:-}}"
APP_URL="${APP_URL:-${FAILOVER_APP_URL:-http://localhost:8015}}"
APP_NAME="${FAILOVER_APP_NAME:-nextjs-app}"
CONFIG_FILE="${FAILOVER_CONFIG_FILE:-.env.local}"
LOG_FILE="${FAILOVER_LOG_FILE:-docs/dr/logs/failover.log}"
# Derive the log directory from the effective log file. It used to be fixed to
# docs/dr/logs, so a custom FAILOVER_LOG_FILE in another directory was never
# created and every `tee -a` in log() failed.
LOG_DIR="$(dirname "$LOG_FILE")"

# Both database URLs are mandatory.
if [ -z "$STANDBY_URL" ]; then
  echo "ERROR: Standby database URL not provided" >&2
  echo "Set DATABASE_URL_STANDBY or use --standby" >&2
  exit 1
fi
if [ -z "$PRIMARY_URL" ]; then
  echo "ERROR: Primary database URL not provided" >&2
  echo "Set DATABASE_URL or use --primary" >&2
  exit 1
fi

# Create the log directory. A failure is reported but does not abort: losing
# the log file must not block an emergency failover.
mkdir -p "$LOG_DIR" || echo "WARN: Cannot create log directory: $LOG_DIR" >&2
# Logging helpers.

# log MESSAGE
# Prefix MESSAGE with a UTC ISO-8601 timestamp, print it to stdout and append
# the same line to $LOG_FILE.
log() {
  local stamp
  stamp=$(date -u +"%Y-%m-%dT%H:%M:%SZ")
  printf '[%s] %s\n' "$stamp" "$1" | tee -a "$LOG_FILE"
}

# log_error MESSAGE
# Same as log with an "ERROR: " prefix; the console copy goes to stderr.
log_error() {
  { log "ERROR: $1"; } >&2
}
# parse_db_url URL
# Split a connection URL of the form scheme://user:pass@host:port/db?query and
# print "user|pass|host|port|dbname" on stdout.
#
# Compared with the previous sed-based version:
#   - a URL without an explicit port now yields the host and port 3306
#     (host and port used to come back empty),
#   - a URL without a password yields the user and an empty password
#     (the user used to swallow "@host"),
#   - a "/" inside the query string no longer corrupts the database name.
# Credentials end at the first "@", as before, so the password may contain
# ":" or "/" but not "@". Percent-encoding is not decoded.
parse_db_url() {
  local url="$1"
  local rest creds hostpart hostport path
  local user pass="" host port="" dbname

  rest="${url#*://}"                 # drop the scheme
  if [[ "$rest" == *@* ]]; then
    creds="${rest%%@*}"              # user[:pass]
    hostpart="${rest#*@}"            # host[:port][/db][?query]
  else
    creds=""
    hostpart="$rest"
  fi

  user="${creds%%:*}"
  if [[ "$creds" == *:* ]]; then
    pass="${creds#*:}"
  fi

  hostport="${hostpart%%[/?]*}"      # everything before the path or query
  path="${hostpart#"$hostport"}"
  host="${hostport%%:*}"
  if [[ "$hostport" == *:* ]]; then
    port="${hostport##*:}"
  fi
  port="${port:-3306}"               # MySQL default when the URL omits it

  path="${path#/}"
  dbname="${path%%\?*}"              # strip the query string

  echo "$user|$pass|$host|$port|$dbname"
}
# check_db_health URL
# Probe the database behind URL with "SELECT 1".
# Returns 0 when the server answers (or always in dry-run mode), 1 otherwise.
#
# Changes from the previous version:
#   - a connect timeout (FAILOVER_DB_TIMEOUT seconds, default 5) keeps an
#     unreachable host from stalling the failover,
#   - the query result is discarded instead of leaking onto the console,
#   - -p is only passed when the URL carries a password, because a bare -p
#     makes the client prompt interactively.
check_db_health() {
  local url="$1"
  local parsed user pass host port dbname
  parsed=$(parse_db_url "$url")
  IFS='|' read -r user pass host port dbname <<< "$parsed"

  log "Checking database health: ${host}:${port}/${dbname}"
  if [ "$DRY_RUN" -eq 1 ]; then
    log " [DRY-RUN] Would check: mysql -h $host -P $port -u $user -e 'SELECT 1'"
    return 0
  fi

  local -a conn=(--connect-timeout="${FAILOVER_DB_TIMEOUT:-5}" -h "$host" -P "$port" -u "$user")
  if [ -n "$pass" ]; then
    conn+=(-p"$pass")
  fi

  if mysql "${conn[@]}" -e "SELECT 1;" >/dev/null 2>&1; then
    log " Database is healthy"
    return 0
  else
    log " Database is NOT reachable"
    return 1
  fi
}
# check_app_health URL
# Request URL with curl (10 second limit, HTTP errors count as failure).
# Returns 0 when the request succeeds, in dry-run mode, or when curl is not
# installed (the check is skipped with a warning); returns 1 otherwise.
check_app_health() {
  local target="$1"
  log "Checking application health: $target"

  if [ "$DRY_RUN" -eq 1 ]; then
    log " [DRY-RUN] Would check: curl -f $target"
    return 0
  fi

  if ! command -v curl >/dev/null 2>&1; then
    log " WARN: curl not available, skipping app health check"
    return 0
  fi

  if curl -sf -o /dev/null -m 10 "$target" 2>/dev/null; then
    log " Application is healthy"
    return 0
  fi

  log " Application is NOT healthy"
  return 1
}
# promote_standby
# Promote the standby ($STANDBY_URL) to a writable primary when it is a
# replica: stop and reset replication, then clear read_only/super_read_only.
# A standby with no replication status is treated as standalone and left as is.
# Returns 0 on success (and always in dry-run mode), 1 on any failure.
#
# Changes from the previous version:
#   - a failed "SHOW SLAVE STATUS" query is now an error; it used to look
#     exactly like "not a replica", so promotion was silently skipped and the
#     function reported success,
#   - the status text is kept in a local variable instead of a global,
#   - -p is only passed when a password is present (a bare -p prompts).
# NOTE(review): the SLAVE statements and super_read_only are server-specific;
# confirm they exist on the server versions used for the standby.
promote_standby() {
  log "Promoting standby to primary..."
  local parsed user pass host port dbname
  parsed=$(parse_db_url "$STANDBY_URL")
  IFS='|' read -r user pass host port dbname <<< "$parsed"

  if [ "$DRY_RUN" -eq 1 ]; then
    log " [DRY-RUN] Would promote standby: STOP SLAVE; RESET SLAVE ALL; SET GLOBAL read_only=OFF;"
    return 0
  fi

  local -a conn=(-h "$host" -P "$port" -u "$user")
  if [ -n "$pass" ]; then
    conn+=(-p"$pass")
  fi

  # Determine whether the standby is currently replicating.
  local replica_status
  if ! replica_status=$(mysql "${conn[@]}" -e "SHOW SLAVE STATUS\G" 2>/dev/null); then
    log_error "Failed to query replication status on standby"
    return 1
  fi

  if [ -n "$replica_status" ]; then
    log " Standby is a slave, promoting..."
    # Stop replication and forget the old primary.
    if mysql "${conn[@]}" -e "STOP SLAVE; RESET SLAVE ALL;" 2>/dev/null; then
      log " Replication stopped and reset"
    else
      log_error "Failed to stop replication"
      return 1
    fi
    # Make the server writable.
    if mysql "${conn[@]}" -e "SET GLOBAL read_only=OFF; SET GLOBAL super_read_only=OFF;" 2>/dev/null; then
      log " Read-only mode disabled"
    else
      log_error "Failed to disable read-only mode"
      return 1
    fi
  else
    log " Standby is not a slave (standalone), skipping promotion"
  fi

  log " Standby promoted successfully"
  return 0
}
# update_config
# Point the application at the standby by setting DATABASE_URL in
# $CONFIG_FILE (a timestamped backup copy is made first) and exporting it for
# the current process. Always returns 0; does nothing in dry-run mode.
#
# Fix: the URL is escaped before it is used as a sed replacement. Connection
# URLs routinely contain "&" (query parameters), which sed expands to the
# matched text, and "|" or "\" would break the expression outright.
update_config() {
  log "Updating application configuration..."
  if [ "$DRY_RUN" -eq 1 ]; then
    log " [DRY-RUN] Would update $CONFIG_FILE: DATABASE_URL=$STANDBY_URL"
    return 0
  fi

  if [ -f "$CONFIG_FILE" ]; then
    # Keep a timestamped copy of the original file.
    cp "$CONFIG_FILE" "${CONFIG_FILE}.bak.$(date +%s)"
    log " Backed up original config to ${CONFIG_FILE}.bak.*"

    if grep -q "^DATABASE_URL=" "$CONFIG_FILE"; then
      # Escape the characters that are special in a sed replacement:
      # backslash, "&" and the "|" delimiter used below.
      local escaped_url
      escaped_url=$(printf '%s\n' "$STANDBY_URL" | sed -e 's/[\\&|]/\\&/g')
      # -i.bak works with both GNU and BSD sed; the extra copy is removed.
      sed -i.bak "s|^DATABASE_URL=.*|DATABASE_URL=${escaped_url}|" "$CONFIG_FILE"
      rm -f "${CONFIG_FILE}.bak" 2>/dev/null || true
      log " Updated DATABASE_URL in $CONFIG_FILE"
    else
      echo "DATABASE_URL=$STANDBY_URL" >> "$CONFIG_FILE"
      log " Added DATABASE_URL to $CONFIG_FILE"
    fi
  else
    log " WARN: Config file $CONFIG_FILE not found, creating new one"
    echo "DATABASE_URL=$STANDBY_URL" > "$CONFIG_FILE"
  fi

  # Also update the environment of the current process.
  export DATABASE_URL="$STANDBY_URL"
  log " Configuration updated"
  return 0
}
# restart_app
# Restart the application container ($APP_NAME) with docker so it picks up
# the new configuration. Returns 0 when the restart succeeded, was skipped
# (--no-restart, dry-run) or docker is not installed; returns 1 when
# `docker restart` fails.
#
# Fix: the manual-restart hint no longer writes the standby URL with its
# password into the log file and console; the password is masked.
restart_app() {
  if [ "$NO_RESTART" -eq 1 ]; then
    log "Skipping application restart (--no-restart)"
    return 0
  fi

  log "Restarting application..."
  if [ "$DRY_RUN" -eq 1 ]; then
    log " [DRY-RUN] Would restart: docker restart $APP_NAME"
    return 0
  fi

  if command -v docker >/dev/null 2>&1; then
    log " Restarting Docker container: $APP_NAME"
    if docker restart "$APP_NAME" 2>/dev/null; then
      log " Container restarted"
      # Give the application a moment to come up before it is probed.
      log " Waiting for application to start..."
      sleep 5
      return 0
    else
      log_error "Failed to restart container $APP_NAME"
      return 1
    fi
  else
    # Replace the password part of scheme://user:password@host with ***.
    local masked_url
    masked_url=$(printf '%s\n' "$STANDBY_URL" | sed 's#\(://[^:/@]*:\)[^@]*@#\1***@#')
    log " WARN: Docker not available, please restart application manually"
    log " Updated DATABASE_URL: $masked_url"
  fi
  return 0
}
# Main flow: banner, then step 1 (primary health check).
# The URLs are logged with the password masked: they used to be written
# verbatim to the log file and console.
PRIMARY_URL_MASKED=$(printf '%s\n' "$PRIMARY_URL" | sed 's#\(://[^:/@]*:\)[^@]*@#\1***@#')
STANDBY_URL_MASKED=$(printf '%s\n' "$STANDBY_URL" | sed 's#\(://[^:/@]*:\)[^@]*@#\1***@#')
log "========================================"
log "Database Failover Started"
log "========================================"
log "Mode: $([ "$AUTO_MODE" -eq 1 ] && echo "semi-auto" || echo "manual")"
log "Dry-run: $([ "$DRY_RUN" -eq 1 ] && echo "yes" || echo "no")"
log "Primary: $PRIMARY_URL_MASKED"
log "Standby: $STANDBY_URL_MASKED"
log ""

# Step 1: check the primary. Without --auto a healthy primary ends the run
# with exit 0; with --auto the failover continues even if the primary is up.
log "[1/5] Checking primary database health..."
PRIMARY_HEALTHY=0
if check_db_health "$PRIMARY_URL"; then
  PRIMARY_HEALTHY=1
  log " Primary is healthy"
  if [ "$AUTO_MODE" -eq 0 ]; then
    log " Primary is healthy. Failover not needed."
    log " Use --auto to force failover even if primary is healthy"
    log "========================================"
    log "Failover Cancelled (Primary Healthy)"
    log "========================================"
    exit 0
  fi
else
  log " Primary is NOT healthy, proceeding with failover"
fi
# Interactive confirmation, asked only with --auto outside dry-run mode.
# NOTE(review): without --auto an unhealthy primary proceeds to failover with
# no confirmation at all -- confirm that this is intended.
# Fixes: the URLs are shown with the password masked; `read` uses -r; and an
# EOF on stdin (non-interactive run) counts as "not confirmed" instead of
# leaving CONFIRM unset, which aborts under `set -u`.
if [ "$AUTO_MODE" -eq 1 ] && [ "$DRY_RUN" -eq 0 ]; then
  echo ""
  echo "WARNING: About to failover from primary to standby."
  echo " Primary: $(printf '%s\n' "$PRIMARY_URL" | sed 's#\(://[^:/@]*:\)[^@]*@#\1***@#')"
  echo " Standby: $(printf '%s\n' "$STANDBY_URL" | sed 's#\(://[^:/@]*:\)[^@]*@#\1***@#')"
  echo ""
  CONFIRM=""
  read -r -p "Type 'FAILover' to confirm: " CONFIRM || CONFIRM=""
  if [ "$CONFIRM" != "FAILover" ]; then
    log "Failover cancelled by user"
    exit 1
  fi
fi
# Step 2: the standby must be reachable, otherwise there is nothing to fail
# over to.
log ""
log "[2/5] Checking standby database health..."
check_db_health "$STANDBY_URL" || {
  log_error "Standby is also not healthy, cannot failover"
  log "========================================"
  log "Failover FAILED (Standby Unhealthy)"
  log "========================================"
  exit 1
}

# Step 3: promote the standby.
log ""
log "[3/5] Promoting standby to primary..."
promote_standby || {
  log_error "Failed to promote standby"
  exit 1
}

# Step 4: rewrite the application configuration, then restart the application.
# The result of update_config is not checked.
log ""
log "[4/5] Updating application configuration and restarting..."
update_config
restart_app || {
  log_error "Failed to restart application"
  log " Manual intervention required"
  exit 1
}
# Step 5: verify the failover by probing the application.
log ""
log "[5/5] Verifying failover..."
sleep 3

# Up to five probes, five seconds apart. No retry message or sleep follows the
# last failed probe; it used to announce a retry that never happened.
APP_HEALTHY=0
for i in 1 2 3 4 5; do
  if check_app_health "$APP_URL"; then
    APP_HEALTHY=1
    break
  fi
  if [ "$i" -lt 5 ]; then
    log " Retry $i/5 in 5 seconds..."
    sleep 5
  fi
done
if [ "$APP_HEALTHY" -eq 0 ]; then
  log_error "Application is not healthy after failover"
  log " Check application logs and configuration"
  log "========================================"
  log "Failover FAILED (App Unhealthy)"
  log "========================================"
  exit 1
fi

# Final request against the application URL. check_app_health skips its probe
# when curl is missing, so this check is skipped too in that case; it used to
# fail the whole failover with "Application not responding".
log " Verifying database connection via application..."
if [ "$DRY_RUN" -eq 0 ]; then
  if ! command -v curl >/dev/null 2>&1; then
    log " WARN: curl not available, skipping final application check"
  elif curl -sf -m 10 "$APP_URL" >/dev/null 2>&1; then
    log " Application responding successfully"
  else
    log_error "Application not responding"
    exit 1
  fi
fi
# Completion summary and post-failover checklist.
# The URLs are logged with the password masked so credentials do not end up
# in the failover log.
log ""
log "========================================"
log "Failover Completed Successfully"
log "========================================"
log "Primary (old): $(printf '%s\n' "$PRIMARY_URL" | sed 's#\(://[^:/@]*:\)[^@]*@#\1***@#')"
log "Standby (new): $(printf '%s\n' "$STANDBY_URL" | sed 's#\(://[^:/@]*:\)[^@]*@#\1***@#')"
log "Application: $APP_URL"
log "Log file: $LOG_FILE"
log ""
log "Post-failover checklist:"
log " 1. Verify application functionality"
log " 2. Update monitoring alerts"
log " 3. Notify stakeholders"
log " 4. Plan primary database recovery"
log " 5. Schedule post-mortem review"
log ""
exit 0

253
scripts/health-check.sh Normal file
View File

@@ -0,0 +1,253 @@
#!/bin/bash
# 健康检查脚本
# 用法: ./health-check.sh
# 检查应用、数据库、磁盘空间、备份新鲜度,输出 JSON 报告
set -u
# Print the usage text for the health-check script to stdout.
# The first line embeds the script name ($0); every other line is fixed text.
show_help() {
  printf '%s\n' \
    "用法: $0 [选项]" \
    '系统健康检查脚本,输出 JSON 格式报告' \
    '选项:' \
    '--app-url URL 应用健康检查 URL(默认 http://localhost:8015)' \
    '--no-app 跳过应用健康检查' \
    '--no-db 跳过数据库检查' \
    '--no-disk 跳过磁盘空间检查' \
    '--no-backup 跳过备份新鲜度检查' \
    '--disk-threshold PCT 磁盘空间阈值百分比(默认 90)' \
    '--backup-max-age HRS 备份最大年龄(小时,默认 24)' \
    '--help, -h 显示帮助信息' \
    '环境变量:' \
    'DATABASE_URL 数据库连接 URL' \
    'HEALTH_CHECK_URL 应用健康检查 URL(默认 http://localhost:8015)' \
    'BACKUP_DIR 备份目录(默认 ./backups)' \
    'HEALTH_CHECK_DISK_THRESHOLD 磁盘阈值(默认 90)' \
    'HEALTH_CHECK_BACKUP_MAX_AGE 备份最大年龄(小时,默认 24)' \
    '退出码:' \
    '0 健康' \
    '1 异常'
}
# Command-line parsing. All checks are enabled by default; value options start
# empty so that the environment and default fallbacks can be applied later.
CHECK_APP=1
CHECK_DB=1
CHECK_DISK=1
CHECK_BACKUP=1
APP_URL=""
DISK_THRESHOLD=""
BACKUP_MAX_AGE=""
while [ $# -gt 0 ]; do
  case "$1" in
    --app-url|--disk-threshold|--backup-max-age)
      # Options that take a value share one arity check; "$1" reproduces the
      # offending flag in the error message.
      if [ $# -lt 2 ]; then
        echo "ERROR: $1 requires an argument" >&2
        exit 1
      fi
      case "$1" in
        --app-url) APP_URL="$2" ;;
        --disk-threshold) DISK_THRESHOLD="$2" ;;
        --backup-max-age) BACKUP_MAX_AGE="$2" ;;
      esac
      shift 2
      ;;
    --no-app)
      CHECK_APP=0
      shift
      ;;
    --no-db)
      CHECK_DB=0
      shift
      ;;
    --no-disk)
      CHECK_DISK=0
      shift
      ;;
    --no-backup)
      CHECK_BACKUP=0
      shift
      ;;
    --help|-h)
      show_help
      exit 0
      ;;
    *)
      echo "ERROR: Unknown argument: $1" >&2
      exit 1
      ;;
  esac
done
# Effective configuration: command-line values win, then environment
# variables, then built-in defaults.
APP_URL="${APP_URL:-${HEALTH_CHECK_URL:-http://localhost:8015}}"
BACKUP_DIR="${BACKUP_DIR:-./backups}"
DISK_THRESHOLD="${DISK_THRESHOLD:-${HEALTH_CHECK_DISK_THRESHOLD:-90}}"
BACKUP_MAX_AGE="${BACKUP_MAX_AGE:-${HEALTH_CHECK_BACKUP_MAX_AGE:-24}}"
DATABASE_URL="${DATABASE_URL:-}"
# Both thresholds are used in integer comparisons and arithmetic further down;
# reject anything that is not a plain non-negative integer up front instead of
# failing in the middle of a check.
case "$DISK_THRESHOLD" in
  ''|*[!0-9]*)
    echo "ERROR: disk threshold must be a non-negative integer, got: $DISK_THRESHOLD" >&2
    exit 1
    ;;
esac
case "$BACKUP_MAX_AGE" in
  ''|*[!0-9]*)
    echo "ERROR: backup max age must be a non-negative integer, got: $BACKUP_MAX_AGE" >&2
    exit 1
    ;;
esac
# json_escape STRING
# Print STRING escaped for use inside a JSON string literal: backslash, double
# quote, tab, newline and carriage return are escaped.
#
# Fixes: newlines and carriage returns used to pass through unescaped and
# produced invalid JSON, and the tab rule relied on sed understanding "\t",
# which is not portable. Pure parameter expansion avoids both problems.
json_escape() {
  local text="$1"
  text=${text//\\/\\\\}       # backslash first, so later escapes are not doubled
  text=${text//\"/\\\"}
  text=${text//$'\t'/\\t}
  text=${text//$'\n'/\\n}
  text=${text//$'\r'/\\r}
  printf '%s\n' "$text"
}
# Accumulated state for the report: RESULTS holds the JSON entries joined by
# ",<newline>"; the counters and OVERALL_STATUS are updated by add_result.
RESULTS=""
OVERALL_STATUS="healthy"
CHECKS_PASSED=0
CHECKS_FAILED=0
CHECKS_WARNED=0

# add_result NAME STATUS MESSAGE [DETAIL]
# Append one check result to RESULTS and update the counters.
#   STATUS "pass" increments CHECKS_PASSED;
#   STATUS "fail" increments CHECKS_FAILED and sets OVERALL_STATUS=unhealthy;
#   STATUS "warn" increments CHECKS_WARNED and downgrades a healthy
#   OVERALL_STATUS to degraded.
# MESSAGE and DETAIL go through json_escape; DETAIL is omitted when empty.
add_result() {
  local name="$1" status="$2" message="$3" detail="${4:-}"
  local escaped_message escaped_detail entry
  local newline=$'\n'
  escaped_message=$(json_escape "$message")
  escaped_detail=$(json_escape "$detail")

  printf -v entry ' {"name": "%s", "status": "%s", "message": "%s"' \
    "$name" "$status" "$escaped_message"
  if [ -n "$detail" ]; then
    entry+=", \"detail\": \"$escaped_detail\""
  fi
  entry+=" }"

  RESULTS="${RESULTS:+${RESULTS},${newline}}${entry}"

  if [ "$status" = "pass" ]; then
    CHECKS_PASSED=$((CHECKS_PASSED + 1))
  elif [ "$status" = "fail" ]; then
    CHECKS_FAILED=$((CHECKS_FAILED + 1))
    OVERALL_STATUS="unhealthy"
  elif [ "$status" = "warn" ]; then
    CHECKS_WARNED=$((CHECKS_WARNED + 1))
    [ "$OVERALL_STATUS" = "healthy" ] && OVERALL_STATUS="degraded"
  fi
}
# 1. Application health check.
# Fix: curl is run without -f and the fallback is applied by assignment. With
# "-f ... -w %{http_code} || echo 000" curl printed the status code and the
# fallback appended "000" on any failure, producing values such as "404000"
# or "000000", so an unreachable application was never reported as such.
if [ "$CHECK_APP" -eq 1 ]; then
  if command -v curl >/dev/null 2>&1; then
    HTTP_CODE=$(curl -s -o /dev/null -w "%{http_code}" -m 10 "$APP_URL" 2>/dev/null) || HTTP_CODE="000"
    # Treat an empty result like a failed connection.
    [ -n "$HTTP_CODE" ] || HTTP_CODE="000"
    if [ "$HTTP_CODE" = "200" ] || [ "$HTTP_CODE" = "302" ]; then
      add_result "app" "pass" "Application is healthy" "HTTP $HTTP_CODE from $APP_URL"
    elif [ "$HTTP_CODE" = "000" ]; then
      add_result "app" "fail" "Application is not reachable" "Cannot connect to $APP_URL"
    else
      add_result "app" "fail" "Application returned error" "HTTP $HTTP_CODE from $APP_URL"
    fi
  else
    add_result "app" "warn" "curl not available, skipping app check" ""
  fi
fi
# 2. Database connectivity check.
# Fixes:
#   - the "SELECT 1" probe no longer prints its result to stdout, where it
#     corrupted the JSON report this script emits;
#   - DATABASE_URL is split with parameter expansion, so a URL without a port
#     falls back to 3306 instead of yielding an empty host and port;
#   - the probe has a connect timeout, and -p is only passed when a password
#     is present (a bare -p makes the client prompt).
if [ "$CHECK_DB" -eq 1 ]; then
  if [ -z "$DATABASE_URL" ]; then
    add_result "database" "warn" "DATABASE_URL not set, skipping DB check" ""
  else
    # Split scheme://user:pass@host:port/db?query. Credentials end at the
    # first "@".
    DB_URL_REST="${DATABASE_URL#*://}"
    DB_URL_CREDS=""
    if [[ "$DB_URL_REST" == *@* ]]; then
      DB_URL_CREDS="${DB_URL_REST%%@*}"
      DB_URL_REST="${DB_URL_REST#*@}"
    fi
    DB_USER="${DB_URL_CREDS%%:*}"
    DB_PASS=""
    if [[ "$DB_URL_CREDS" == *:* ]]; then
      DB_PASS="${DB_URL_CREDS#*:}"
    fi
    DB_URL_HOSTPORT="${DB_URL_REST%%[/?]*}"
    DB_HOST="${DB_URL_HOSTPORT%%:*}"
    DB_PORT=""
    if [[ "$DB_URL_HOSTPORT" == *:* ]]; then
      DB_PORT="${DB_URL_HOSTPORT##*:}"
    fi
    DB_PORT="${DB_PORT:-3306}"
    DB_NAME="${DB_URL_REST#"$DB_URL_HOSTPORT"}"
    DB_NAME="${DB_NAME#/}"
    DB_NAME="${DB_NAME%%\?*}"

    if command -v mysql >/dev/null 2>&1; then
      HC_MYSQL_ARGS=(--connect-timeout=10 -h "$DB_HOST" -P "$DB_PORT" -u "$DB_USER")
      if [ -n "$DB_PASS" ]; then
        HC_MYSQL_ARGS+=(-p"$DB_PASS")
      fi
      if mysql "${HC_MYSQL_ARGS[@]}" -e "SELECT 1;" >/dev/null 2>&1; then
        # Server version, reported in the detail text only.
        DB_VERSION=$(mysql "${HC_MYSQL_ARGS[@]}" -e "SELECT VERSION();" -s -N 2>/dev/null || echo "unknown")
        add_result "database" "pass" "Database connection successful" "Host: $DB_HOST:$DB_PORT, DB: $DB_NAME, Version: $DB_VERSION"
      else
        add_result "database" "fail" "Database connection failed" "Cannot connect to $DB_HOST:$DB_PORT/$DB_NAME"
      fi
    else
      add_result "database" "warn" "mysql client not available, skipping DB check" ""
    fi
  fi
fi
# 3. Disk space check for the filesystem holding the current directory.
# Fixes: df is run with -P so each filesystem is reported on a single line
# with the POSIX column layout (without it a long device name can wrap and
# shift every field), and the parsed percentage is validated before it is
# used in integer comparisons.
if [ "$CHECK_DISK" -eq 1 ]; then
  DISK_INFO=$(df -P -h . 2>/dev/null | tail -1)
  DISK_USE_PCT=""
  if [ -n "$DISK_INFO" ]; then
    DISK_USE_PCT=$(echo "$DISK_INFO" | awk '{print $5}' | sed 's/%//')
  fi
  case "$DISK_USE_PCT" in
    ''|*[!0-9]*)
      # No df output, or the capacity column was not a number.
      add_result "disk" "warn" "Could not determine disk usage" ""
      ;;
    *)
      DISK_USE_HUMAN=$(echo "$DISK_INFO" | awk '{print $3}')
      DISK_TOTAL_HUMAN=$(echo "$DISK_INFO" | awk '{print $2}')
      DISK_AVAIL_HUMAN=$(echo "$DISK_INFO" | awk '{print $4}')
      DISK_MOUNT=$(echo "$DISK_INFO" | awk '{print $6}')
      # fail at or above the threshold, warn within 10 points below it.
      if [ "$DISK_USE_PCT" -ge "$DISK_THRESHOLD" ]; then
        add_result "disk" "fail" "Disk space critical" "Usage: ${DISK_USE_PCT}% (threshold: ${DISK_THRESHOLD}%), Used: ${DISK_USE_HUMAN}/${DISK_TOTAL_HUMAN}, Available: ${DISK_AVAIL_HUMAN}, Mount: ${DISK_MOUNT}"
      elif [ "$DISK_USE_PCT" -ge $((DISK_THRESHOLD - 10)) ]; then
        add_result "disk" "warn" "Disk space warning" "Usage: ${DISK_USE_PCT}% (threshold: ${DISK_THRESHOLD}%), Used: ${DISK_USE_HUMAN}/${DISK_TOTAL_HUMAN}, Available: ${DISK_AVAIL_HUMAN}, Mount: ${DISK_MOUNT}"
      else
        add_result "disk" "pass" "Disk space OK" "Usage: ${DISK_USE_PCT}%, Used: ${DISK_USE_HUMAN}/${DISK_TOTAL_HUMAN}, Available: ${DISK_AVAIL_HUMAN}, Mount: ${DISK_MOUNT}"
      fi
      ;;
  esac
fi
# 4. Backup freshness check: find the newest db_backup_*.sql.gz in BACKUP_DIR
# and compare its age with BACKUP_MAX_AGE (hours). Older than the maximum is a
# failure, older than half the maximum is a warning.
if [ "$CHECK_BACKUP" -eq 1 ]; then
  if [ ! -d "$BACKUP_DIR" ]; then
    add_result "backup" "warn" "Backup directory does not exist" "$BACKUP_DIR"
  else
    LATEST_BACKUP=$(ls -t "$BACKUP_DIR"/db_backup_*.sql.gz 2>/dev/null | head -1)
    if [ -z "$LATEST_BACKUP" ]; then
      add_result "backup" "fail" "No backup files found" "No db_backup_*.sql.gz files in $BACKUP_DIR"
    else
      # Modification time and size: try the GNU stat syntax first, then BSD.
      BACKUP_MTIME=$(stat -c%Y "$LATEST_BACKUP" 2>/dev/null || stat -f%m "$LATEST_BACKUP" 2>/dev/null)
      CURRENT_TIME=$(date +%s)
      BACKUP_AGE_HOURS=$(( (CURRENT_TIME - BACKUP_MTIME) / 3600 ))
      BACKUP_SIZE=$(stat -c%s "$LATEST_BACKUP" 2>/dev/null || stat -f%z "$LATEST_BACKUP" 2>/dev/null)
      # Scale the byte count to B/KB/MB/GB/TB with one decimal.
      BACKUP_SIZE_HUMAN=$(echo "$BACKUP_SIZE" | awk '{split("B KB MB GB TB",v);i=1;while($1>=1024&&i<5){$1/=1024;i++};printf "%.1f%s",$1,v[i]}')
      BACKUP_BASENAME=$(basename "$LATEST_BACKUP")
      if [ "$BACKUP_AGE_HOURS" -gt "$BACKUP_MAX_AGE" ]; then
        add_result "backup" "fail" "Backup is stale" "Latest backup is ${BACKUP_AGE_HOURS}h old (max: ${BACKUP_MAX_AGE}h), File: ${BACKUP_BASENAME}, Size: $BACKUP_SIZE_HUMAN"
      elif [ "$BACKUP_AGE_HOURS" -gt $((BACKUP_MAX_AGE / 2)) ]; then
        add_result "backup" "warn" "Backup getting old" "Latest backup is ${BACKUP_AGE_HOURS}h old (max: ${BACKUP_MAX_AGE}h), File: ${BACKUP_BASENAME}, Size: $BACKUP_SIZE_HUMAN"
      else
        add_result "backup" "pass" "Backup is fresh" "Latest backup is ${BACKUP_AGE_HOURS}h old, File: ${BACKUP_BASENAME}, Size: $BACKUP_SIZE_HUMAN"
      fi
    fi
  fi
fi
# Emit the JSON report on stdout and derive the exit code from the overall
# status: 1 for "unhealthy", 0 otherwise ("healthy" or "degraded").
TIMESTAMP=$(date -u +"%Y-%m-%dT%H:%M:%SZ")
CHECKS_TOTAL=$((CHECKS_PASSED + CHECKS_FAILED + CHECKS_WARNED))
printf '{\n"timestamp": "%s",\n"status": "%s",\n"summary": {\n"total": %s,\n"passed": %s,\n"failed": %s,\n"warned": %s\n},\n"checks": [\n%s\n]\n}\n' \
  "$TIMESTAMP" \
  "$OVERALL_STATUS" \
  "$CHECKS_TOTAL" \
  "$CHECKS_PASSED" \
  "$CHECKS_FAILED" \
  "$CHECKS_WARNED" \
  "$RESULTS"
case "$OVERALL_STATUS" in
  unhealthy) exit 1 ;;
  *) exit 0 ;;
esac

137
scripts/security-scan.ps1 Normal file
View File

@@ -0,0 +1,137 @@
# 本地安全扫描脚本 (Windows PowerShell)
# 用法: .\scripts\security-scan.ps1
# 功能: npm audit + Trivy 文件系统扫描,输出彩色报告
# 退出码: 0=无高危漏洞, 1=存在高危漏洞
# Keep going after non-terminating errors; every scan step reports its own
# outcome.
$ErrorActionPreference = "Continue"
# The project root is the parent of the directory holding this script.
$ProjectRoot = Resolve-Path "$PSScriptRoot\.."
Set-Location $ProjectRoot
# Set to 1 by Write-Fail; decides the exit code at the end of the script.
$script:HasHigh = 0

# Print a three-line cyan banner around $msg.
function Write-Header {
    param($msg)
    $rule = "================================================"
    foreach ($line in @($rule, " $msg", $rule)) {
        Write-Host $line -ForegroundColor Cyan
    }
}

# Tagged, coloured status lines. Write-Fail also records the failure.
function Write-Pass {
    param($msg)
    Write-Host "[PASS] $msg" -ForegroundColor Green
}
function Write-Warn2 {
    param($msg)
    Write-Host "[WARN] $msg" -ForegroundColor Yellow
}
function Write-Fail {
    param($msg)
    Write-Host "[FAIL] $msg" -ForegroundColor Red
    $script:HasHigh = 1
}
function Write-Info2 {
    param($msg)
    Write-Host "[INFO] $msg" -ForegroundColor Blue
}

# Return $true when a command with the given name can be resolved.
function Test-Command {
    param($name)
    $found = Get-Command $name -ErrorAction SilentlyContinue
    return [bool]$found
}

Write-Header "本地安全扫描"
Write-Info2 "项目目录: $ProjectRoot"
Write-Host ""
# ------------------------------------------------
# 1. npm audit
# ------------------------------------------------
# Runs `npm audit --json`, prints the per-severity counts and fails the scan
# when critical or high vulnerabilities are reported.
# Fix: output without metadata.vulnerabilities (for example the error object
# npm prints when the audit itself cannot run) used to be read as zero
# vulnerabilities and reported as PASS. It is now treated as a parse failure,
# and the parse-failure path lets npm's own exit code (--audit-level=high)
# decide, so the scan no longer passes when the audit could not be evaluated.
Write-Header "1/2 npm audit (依赖审计)"
if (-not (Test-Command "npm")) {
    Write-Fail "未检测到 npm,请先安装 Node.js"
    exit 1
}
$auditJson = "$env:TEMP\audit-report.json"
npm audit --json 2>$null | Out-File -FilePath $auditJson -Encoding utf8
if (Test-Path $auditJson) {
    try {
        $audit = Get-Content $auditJson -Raw | ConvertFrom-Json
        $v = $audit.metadata.vulnerabilities
        if ($null -eq $v) {
            throw "npm audit output has no metadata.vulnerabilities"
        }
        $critical = if ($v.critical) { [int]$v.critical } else { 0 }
        $high = if ($v.high) { [int]$v.high } else { 0 }
        $moderate = if ($v.moderate) { [int]$v.moderate } else { 0 }
        $low = if ($v.low) { [int]$v.low } else { 0 }
        Write-Host -NoNewline " critical: "; Write-Host -NoNewline "$critical " -ForegroundColor Red
        Write-Host -NoNewline " high: "; Write-Host -NoNewline "$high " -ForegroundColor Red
        Write-Host -NoNewline " moderate: "; Write-Host -NoNewline "$moderate " -ForegroundColor Yellow
        Write-Host -NoNewline " low: "; Write-Host "$low" -ForegroundColor Green
        if ($critical -gt 0 -or $high -gt 0) {
            Write-Fail "npm audit 发现 critical/high 漏洞"
        } else {
            Write-Pass "npm audit 无 critical/high 漏洞"
        }
    } catch {
        Write-Warn2 "npm audit 报告解析失败,显示原始输出"
        # Show the human-readable report; a non-zero exit means high/critical
        # findings or a failed audit.
        npm audit --audit-level=high
        if ($LASTEXITCODE -ne 0) {
            Write-Fail "npm audit 未通过 (存在 high 及以上漏洞或审计失败)"
        }
    }
    Copy-Item $auditJson "$ProjectRoot\audit-report.json" -Force
    Write-Info2 "报告已保存: audit-report.json"
} else {
    Write-Warn2 "npm audit 未生成报告"
}
Write-Host ""
# ------------------------------------------------
# 2. Trivy filesystem scan
# ------------------------------------------------
# Runs `trivy fs` with JSON output, counts findings per severity and fails the
# scan on critical/high findings, then prints Trivy's table view.
# Fix: a report left over from an earlier run is deleted before scanning.
# Otherwise a failed scan leaves the old file in place and its stale results
# are parsed and reported as the current state.
Write-Header "2/2 Trivy FS Scan (文件系统扫描)"
if (-not (Test-Command "trivy")) {
    Write-Warn2 "未检测到 trivy,跳过文件系统扫描"
    Write-Info2 "安装 Trivy: https://aquasecurity.github.io/trivy/latest/getting-started/installation/"
} else {
    $trivyReport = "$ProjectRoot\trivy-fs-report.json"
    if (Test-Path $trivyReport) {
        Remove-Item $trivyReport -Force -ErrorAction SilentlyContinue
    }
    trivy fs --format json --output $trivyReport --exit-code 0 . 2>$null
    if ($LASTEXITCODE -eq 0) {
        Write-Pass "Trivy 扫描完成"
    } else {
        Write-Warn2 "Trivy 扫描返回非零状态(可能存在漏洞)"
    }
    if (Test-Path $trivyReport) {
        try {
            $trivy = Get-Content $trivyReport -Raw | ConvertFrom-Json
            # Flatten the vulnerabilities of all result entries into one list.
            $allVulns = @()
            foreach ($r in $trivy.Results) {
                if ($r.Vulnerabilities) { $allVulns += $r.Vulnerabilities }
            }
            $total = $allVulns.Count
            $critical = @($allVulns | Where-Object { $_.Severity -eq "CRITICAL" }).Count
            $high = @($allVulns | Where-Object { $_.Severity -eq "HIGH" }).Count
            $medium = @($allVulns | Where-Object { $_.Severity -eq "MEDIUM" }).Count
            $low = @($allVulns | Where-Object { $_.Severity -eq "LOW" }).Count
            Write-Host -NoNewline " 总计: $total critical: "; Write-Host -NoNewline "$critical " -ForegroundColor Red
            Write-Host -NoNewline " high: "; Write-Host -NoNewline "$high " -ForegroundColor Red
            Write-Host -NoNewline " medium: "; Write-Host -NoNewline "$medium " -ForegroundColor Yellow
            Write-Host -NoNewline " low: "; Write-Host "$low" -ForegroundColor Green
            if ($critical -gt 0 -or $high -gt 0) {
                Write-Fail "Trivy 发现 critical/high 漏洞"
            } else {
                Write-Pass "Trivy 无 critical/high 漏洞"
            }
            Write-Info2 "报告已保存: trivy-fs-report.json"
        } catch {
            Write-Warn2 "Trivy 报告解析失败"
        }
    }
    Write-Host ""
    Write-Info2 "Trivy 表格视图:"
    trivy fs --format table --exit-code 0 .
}
Write-Host ""
# ------------------------------------------------
# Summary
# ------------------------------------------------
# Exit 1 when any step called Write-Fail, otherwise exit 0.
Write-Header "扫描汇总"
if ($script:HasHigh -ne 0) {
    Write-Fail "发现高危漏洞,请尽快处理 (exit 1)"
    Write-Host " SLA: critical 24h / high 7d / medium 30d / low 90d" -ForegroundColor Blue
    exit 1
}
Write-Pass "未发现高危漏洞 (exit 0)"
exit 0

133
scripts/security-scan.sh Normal file
View File

@@ -0,0 +1,133 @@
#!/bin/bash
# 本地安全扫描脚本 (Linux/macOS)
# 用法: ./scripts/security-scan.sh
# 功能: npm audit + Trivy 文件系统扫描,输出彩色报告
# 退出码: 0=无高危漏洞, 1=存在高危漏洞
set -uo pipefail
# ANSI colour sequences, expanded by `echo -e` in the print helpers.
RED='\033[0;31m'
GREEN='\033[0;32m'
YELLOW='\033[1;33m'
BLUE='\033[0;34m'
CYAN='\033[0;36m'
NC='\033[0m' # No Color
# The project root is the parent of the directory holding this script.
PROJECT_ROOT="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)"
# Stop if the project root cannot be entered: the cd used to be unchecked, so
# on failure the scans would have run against whatever directory the caller
# happened to be in.
if [ -z "$PROJECT_ROOT" ] || ! cd "$PROJECT_ROOT"; then
  echo "[FAIL] cannot enter project root: ${PROJECT_ROOT:-unknown}" >&2
  exit 1
fi
# Set to 1 by print_err; decides the exit code at the end of the script.
HAS_HIGH=0
# print_header TITLE -- three-line cyan banner around TITLE.
print_header() {
  local rule="================================================"
  echo -e "${CYAN}${rule}${NC}"
  echo -e "${CYAN} $1${NC}"
  echo -e "${CYAN}${rule}${NC}"
}

# print_tagged COLOR TAG MESSAGE -- shared formatter for the status lines.
print_tagged() {
  echo -e "${1}[${2}]${NC} ${3}"
}

# Status helpers; print_err also records that the scan has failed.
print_ok() {
  print_tagged "$GREEN" "PASS" "$1"
}
print_warn() {
  print_tagged "$YELLOW" "WARN" "$1"
}
print_err() {
  print_tagged "$RED" "FAIL" "$1"
  HAS_HIGH=1
}
print_info() {
  print_tagged "$BLUE" "INFO" "$1"
}

# command_exists NAME -- succeeds when NAME resolves to a command.
command_exists() {
  command -v "$1" &>/dev/null
}

print_header "本地安全扫描"
print_info "项目目录: $PROJECT_ROOT"
echo ""
# ------------------------------------------------
# 1. npm audit
# ------------------------------------------------
# Runs `npm audit --json`, prints the per-severity counts and fails the scan
# when critical or high vulnerabilities are reported.
# Fixes:
#   - the report is written straight to the project root instead of going
#     through a fixed, predictable /tmp/audit-report.json;
#   - output without .metadata.vulnerabilities (for example the error object
#     npm prints when the audit cannot run) is no longer read as zero
#     vulnerabilities and reported as PASS;
#   - without jq the scan used to warn only, so high findings still exited 0;
#     npm's own exit code (--audit-level=high) now decides in that case.
print_header "1/2 npm audit (依赖审计)"
if ! command_exists npm; then
  print_err "未检测到 npm,请先安装 Node.js"
  exit 1
fi
AUDIT_REPORT="$PROJECT_ROOT/audit-report.json"
# npm audit exits non-zero when it finds vulnerabilities; the JSON is still
# written, so the exit status is ignored here.
npm audit --json > "$AUDIT_REPORT" 2>/dev/null || true
if [ -f "$AUDIT_REPORT" ]; then
  if command_exists jq; then
    if jq -e '.metadata.vulnerabilities' "$AUDIT_REPORT" >/dev/null 2>&1; then
      CRITICAL=$(jq -r '.metadata.vulnerabilities.critical // 0' "$AUDIT_REPORT")
      HIGH=$(jq -r '.metadata.vulnerabilities.high // 0' "$AUDIT_REPORT")
      MODERATE=$(jq -r '.metadata.vulnerabilities.moderate // 0' "$AUDIT_REPORT")
      LOW=$(jq -r '.metadata.vulnerabilities.low // 0' "$AUDIT_REPORT")
      echo -e " critical: ${RED}${CRITICAL}${NC} high: ${RED}${HIGH}${NC} moderate: ${YELLOW}${MODERATE}${NC} low: ${GREEN}${LOW}${NC}"
      if [ "$CRITICAL" -gt 0 ] || [ "$HIGH" -gt 0 ]; then
        print_err "npm audit 发现 critical/high 漏洞"
      else
        print_ok "npm audit 无 critical/high 漏洞"
      fi
    else
      # No vulnerability summary: the audit did not produce a usable result.
      print_err "npm audit 未返回漏洞统计(审计可能失败),请检查 audit-report.json"
    fi
  else
    print_warn "未安装 jq,跳过漏洞计数,显示原始报告"
    # --audit-level only controls the exit code; the report still lists all
    # findings. A non-zero exit means high/critical findings or a failed audit.
    if ! npm audit --audit-level=high; then
      print_err "npm audit 未通过 (存在 high 及以上漏洞或审计失败)"
    fi
  fi
  print_info "报告已保存: audit-report.json"
else
  print_warn "npm audit 未生成报告"
fi
echo ""
# ------------------------------------------------
# 2. Trivy filesystem scan
# ------------------------------------------------
# Runs `trivy fs` with JSON output, counts findings per severity and fails the
# scan on critical/high findings, then prints Trivy's table view.
# Fixes:
#   - a report left over from an earlier run is deleted first, so a failed
#     scan can no longer have stale results reported as current;
#   - without jq the findings were never evaluated and the scan always
#     passed; Trivy's own exit code (--severity HIGH,CRITICAL --exit-code 1)
#     now decides in that case.
print_header "2/2 Trivy FS Scan (文件系统扫描)"
if ! command_exists trivy; then
  print_warn "未检测到 trivy,跳过文件系统扫描"
  print_info "安装 Trivy: https://aquasecurity.github.io/trivy/latest/getting-started/installation/"
else
  TRIVY_REPORT="$PROJECT_ROOT/trivy-fs-report.json"
  rm -f "$TRIVY_REPORT"
  if trivy fs --format json --output "$TRIVY_REPORT" --exit-code 0 . >/dev/null 2>&1; then
    print_ok "Trivy 扫描完成"
  else
    print_warn "Trivy 扫描返回非零状态(可能存在漏洞)"
  fi
  if [ -f "$TRIVY_REPORT" ] && command_exists jq; then
    TOTAL=$(jq -r '[.Results[]?.Vulnerabilities[]?] | length' "$TRIVY_REPORT" 2>/dev/null || echo "0")
    CRITICAL=$(jq -r '[.Results[]?.Vulnerabilities[]? | select(.Severity=="CRITICAL")] | length' "$TRIVY_REPORT" 2>/dev/null || echo "0")
    HIGH=$(jq -r '[.Results[]?.Vulnerabilities[]? | select(.Severity=="HIGH")] | length' "$TRIVY_REPORT" 2>/dev/null || echo "0")
    MEDIUM=$(jq -r '[.Results[]?.Vulnerabilities[]? | select(.Severity=="MEDIUM")] | length' "$TRIVY_REPORT" 2>/dev/null || echo "0")
    LOW=$(jq -r '[.Results[]?.Vulnerabilities[]? | select(.Severity=="LOW")] | length' "$TRIVY_REPORT" 2>/dev/null || echo "0")
    echo -e " 总计: ${TOTAL} critical: ${RED}${CRITICAL}${NC} high: ${RED}${HIGH}${NC} medium: ${YELLOW}${MEDIUM}${NC} low: ${GREEN}${LOW}${NC}"
    if [ "$CRITICAL" -gt 0 ] || [ "$HIGH" -gt 0 ]; then
      print_err "Trivy 发现 critical/high 漏洞"
    else
      print_ok "Trivy 无 critical/high 漏洞"
    fi
    print_info "报告已保存: trivy-fs-report.json"
  elif [ -f "$TRIVY_REPORT" ]; then
    # No jq to read the report: let Trivy's exit code decide.
    print_warn "未安装 jq,改用 Trivy 退出码判断 critical/high 漏洞"
    if trivy fs --severity HIGH,CRITICAL --exit-code 1 --quiet . >/dev/null 2>&1; then
      print_ok "Trivy 无 critical/high 漏洞"
    else
      print_err "Trivy 发现 critical/high 漏洞"
    fi
    print_info "报告已保存: trivy-fs-report.json"
  fi
  # Human-readable table view of the same scan.
  echo ""
  print_info "Trivy 表格视图:"
  trivy fs --format table --exit-code 0 . || true
fi
echo ""
echo ""
# ------------------------------------------------
# 汇总
# ------------------------------------------------
print_header "扫描汇总"
if [ "$HAS_HIGH" -eq 0 ]; then
print_ok "未发现高危漏洞 (exit 0)"
exit 0
else
print_err "发现高危漏洞,请尽快处理 (exit 1)"
echo -e " ${BLUE}SLA:${NC} critical 24h / high 7d / medium 30d / low 90d"
exit 1
fi