#!/bin/bash
#
# Review Redis 故障切换脚本
# 当生产环境 Redis 故障时,使用此脚本将 Review 环境切换为主服务
#
# 使用方法:
# ./failover-to-review.sh status - 查看当前状态
# ./failover-to-review.sh failover - 执行故障切换(提升 132 为 Master)
# ./failover-to-review.sh sentinel - 仅部署 Review Sentinel
# ./failover-to-review.sh full - 完整切换(failover + sentinel)
# ./failover-to-review.sh rollback - 安全回滚(先同步数据到生产,再切换)
#
# 安全回滚流程:
# 前置条件:【重要】必须先停止或暂停生产 Sentinel,防止自动故障转移
# 1. 让 10.56 先作为 132 的从节点同步数据(反向同步)
# 2. 等待数据同步完成
# 3. 将 132 设为只读,防止切换期间数据丢失
# 4. 提升 10.56 为主节点
# 5. 132 重新指向 10.56,恢复级联复制结构
# 6. 重新启动生产 Sentinel
#
# 数据一致性保证:
# - 回滚前:通过反向同步确保故障期间的数据同步到生产
# - 回滚中:将 132 设为只读(min-replicas-to-write=99)防止数据丢失
# - 回滚后:验证偏移量差异,确保数据完全同步
#

# Stop the script as soon as an unchecked command fails.
set -o errexit

# ==================== Global options ====================
# -y / --yes : answer ordinary confirmation prompts automatically
#              (dangerous operations still ask).
# --force    : skip every confirmation, including the dangerous ones.
FORCE_MODE=false
AUTO_CONFIRM=false

# Scan every command-line word for the global switches and record them in
# AUTO_CONFIRM / FORCE_MODE. Words that are not one of the known switches
# (for example the sub-command name) are ignored.
#   -y | --yes : AUTO_CONFIRM=true
#   --force    : AUTO_CONFIRM=true and FORCE_MODE=true
parse_global_args() {
  local word
  for word; do
    if [[ "$word" == "--force" ]]; then
      AUTO_CONFIRM=true
      FORCE_MODE=true
    elif [[ "$word" == "-y" || "$word" == "--yes" ]]; then
      AUTO_CONFIRM=true
    fi
  done
}

# Ask the operator a yes/no question.
# Globals:   AUTO_CONFIRM (read) - when "true" the question is answered "yes"
#            automatically and only logged.
# Arguments: $1 - question text shown to the operator
#            $2 - answer assumed when the operator just presses Enter
#                 (optional, defaults to "n")
# Returns:   0 when the answer is y/Y, 1 otherwise.
confirm_prompt() {
  local message=$1
  local default=${2:-n}
  local answer=""

  if [ "$AUTO_CONFIRM" = true ]; then
    log_info "自动确认: $message"
    return 0
  fi

  # -r keeps backslashes literal. A failed read (EOF on a non-interactive
  # stdin) is treated as an empty answer instead of aborting the script.
  read -r -p "$message (y/n): " answer || true

  # An empty answer falls back to the caller-supplied default.
  if [ -z "$answer" ]; then
    answer=$default
  fi

  case "$answer" in
    y|Y) return 0 ;;
  esac
  return 1
}

# Ask for confirmation of a dangerous operation. Unlike confirm_prompt this
# is NOT skipped by AUTO_CONFIRM; only FORCE_MODE bypasses it.
# Globals:   FORCE_MODE (read)
# Arguments: $1 - question text shown to the operator
# Returns:   0 when forced or when the answer is y/Y, 1 otherwise.
confirm_dangerous() {
  local message=$1
  local answer=""

  if [ "$FORCE_MODE" = true ]; then
    log_warn "强制模式: 跳过危险操作确认"
    return 0
  fi

  # -r keeps backslashes literal. A failed read (EOF on a non-interactive
  # stdin) counts as "no" rather than aborting the script.
  read -r -p "$message (y/n): " answer || true

  case "$answer" in
    y|Y) return 0 ;;
  esac
  return 1
}

# ==================== Configuration ====================
# SECURITY NOTE(review): the passwords below are kept only as defaults for
# backward compatibility. Export REDIS_PASSWORD / SENTINEL_PASSWORD in the
# environment to override them instead of editing this file.
REDIS_PASSWORD="${REDIS_PASSWORD:-sino#650}"
SENTINEL_PASSWORD="${SENTINEL_PASSWORD:-sino#650}"

# Review nodes
NODE_132="192.168.3.132"
NODE_133="192.168.3.133"
NODE_134="192.168.3.134"

# Production node
PROD_MASTER="192.168.10.56"
PROD_PORT=6379

# Ports used on the Review nodes
REDIS_PORT=6379
SENTINEL_PORT=26379

# Directory containing this script (deploy_sentinel looks for its files here)
SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)"

# ==================== Coloured output ====================
# ANSI escape sequences, stored unexpanded; they are interpreted at print
# time by `echo -e` / `printf '%b'`.
NC="\033[0m"
RED="\033[0;31m"
GREEN="\033[0;32m"
YELLOW="\033[1;33m"
BLUE="\033[0;34m"

# Print an informational message ($1) on stdout with a green [INFO] tag.
log_info() {
  printf '%b\n' "${GREEN}[INFO]${NC} $1"
}

# Print a warning message ($1) on stdout with a yellow [WARN] tag.
log_warn() {
  printf '%b\n' "${YELLOW}[WARN]${NC} $1"
}

# Print an error message ($1) on stdout with a red [ERROR] tag.
log_error() {
  printf '%b\n' "${RED}[ERROR]${NC} $1"
}

# ==================== Docker / Redis command wrappers ====================

# Redis image used when a throw-away container has to run redis-cli.
REDIS_IMAGE='bitnami/redis:7.0.11'

# Run a redis-cli command against host:port from inside a Docker container.
# A running container whose name matches "review_redis_master" is reused via
# `docker exec`; otherwise a temporary host-network container of REDIS_IMAGE
# is started. stderr of the docker call is discarded.
# Globals:   REDIS_PASSWORD, REDIS_IMAGE (read)
# Arguments: $1 - host, $2 - port, remaining - redis-cli arguments
# Outputs:   redis-cli stdout
# Returns:   exit status of the docker command
redis_cmd() {
  local target_host=$1
  local target_port=$2
  shift 2

  # The redis-cli invocation is the same for both execution paths.
  local -a cli=(redis-cli -h "$target_host" -p "$target_port"
    -a "$REDIS_PASSWORD" --no-auth-warning "$@")

  local running
  running=$(docker ps -q -f name=review_redis_master 2>/dev/null | head -1)

  if [ -z "$running" ]; then
    # No long-lived container available: use a throw-away one.
    docker run --rm --network host "$REDIS_IMAGE" "${cli[@]}" 2>/dev/null
  else
    docker exec "$running" "${cli[@]}" 2>/dev/null
  fi
}

# Verify that REDIS_IMAGE is available locally; if not, tell the operator
# how to pull it.
# Returns: 0 when `docker image inspect` succeeds, 1 otherwise.
check_redis_image() {
  if docker image inspect "$REDIS_IMAGE" &>/dev/null; then
    return 0
  fi

  log_warn "Redis 镜像 $REDIS_IMAGE 不存在"
  log_info "请先拉取镜像: docker pull $REDIS_IMAGE"
  return 1
}

# ==================== Feature functions ====================

# Print the replication status of the three Review nodes: role, plus either
# master address and link status (for a slave) or the connected slave count.
# A node whose INFO query returns nothing is reported as unreachable.
check_status() {
  local node info role master_host master_port master_link slaves

  printf '%b\n' \
    "${BLUE}========================================${NC}" \
    "${BLUE} 当前 Redis 状态 ${NC}" \
    "${BLUE}========================================${NC}"

  for node in "$NODE_132" "$NODE_133" "$NODE_134"; do
    echo ""
    echo -e "${GREEN}[$node]${NC}"

    # A failing query must not abort the script; it is reported just below.
    info=$(redis_cmd "$node" "$REDIS_PORT" INFO replication 2>/dev/null) || true
    if [ -z "$info" ]; then
      echo -e " ${RED}无法连接${NC}"
      continue
    fi

    # INFO lines are "key:value\r\n"; take the value and strip the CR/LF.
    role=$(awk -F: '/^role:/ {print $2}' <<<"$info" | tr -d '\r\n')
    master_host=$(awk -F: '/^master_host:/ {print $2}' <<<"$info" | tr -d '\r\n')
    master_port=$(awk -F: '/^master_port:/ {print $2}' <<<"$info" | tr -d '\r\n')
    master_link=$(awk -F: '/^master_link_status:/ {print $2}' <<<"$info" | tr -d '\r\n')
    slaves=$(awk -F: '/^connected_slaves:/ {print $2}' <<<"$info" | tr -d '\r\n')

    echo -e " 角色: ${YELLOW}$role${NC}"
    if [ "$role" != "slave" ]; then
      echo -e " 从节点数: $slaves"
    else
      echo -e " 主节点: $master_host:$master_port"
      echo -e " 同步状态: $master_link"
    fi
  done

  echo ""
}

# Print the replication role of the production node.
# Returns: 0 when the node answered INFO, 1 when it could not be reached.
check_prod_status() {
  local info role

  printf '%b\n' \
    "${BLUE}========================================${NC}" \
    "${BLUE} 生产环境 Redis 状态 ${NC}" \
    "${BLUE}========================================${NC}"
  echo ""

  # A failing query must not abort the script; it is reported just below.
  info=$(redis_cmd "$PROD_MASTER" "$PROD_PORT" INFO replication 2>/dev/null) || true
  if [ -z "$info" ]; then
    echo -e "${RED}生产环境 $PROD_MASTER:$PROD_PORT 无法连接${NC}"
    return 1
  fi

  role=$(awk -F: '/^role:/ {print $2}' <<<"$info" | tr -d '\r\n')
  echo -e "生产主节点 $PROD_MASTER: ${GREEN}$role${NC}"
  return 0
}

# Promote node 132 to master with SLAVEOF NO ONE and verify its new role.
# Returns: 0 when the node reports role "master" afterwards, 1 when the
#          command is not acknowledged with OK or the role did not change.
promote_132() {
  local reply role

  echo ""
  log_info "正在提升 $NODE_132 为 Master..."

  # Detach the node from its current master.
  reply=$(redis_cmd "$NODE_132" "$REDIS_PORT" SLAVEOF NO ONE) || true
  if [ "$reply" != "OK" ]; then
    log_error "SLAVEOF NO ONE 执行失败: $reply"
    return 1
  fi

  sleep 2

  # Confirm that the role actually changed.
  role=$(redis_cmd "$NODE_132" "$REDIS_PORT" INFO replication \
    | grep "^role:" | cut -d: -f2 | tr -d '\r\n')
  if [ "$role" != "master" ]; then
    log_error "$NODE_132 提升失败,当前角色: $role"
    return 1
  fi

  log_info "$NODE_132 已成功提升为 Master"
}

# Make nodes 133 and 134 replicate from node 132. A node that already points
# at 132 is left alone; otherwise SLAVEOF is issued and the result verified.
# A node that fails verification is only logged; the function keeps going.
configure_slaves() {
  local node current

  echo ""
  log_info "正在配置从节点指向 $NODE_132..."

  for node in "$NODE_133" "$NODE_134"; do
    current=$(redis_cmd "$node" "$REDIS_PORT" INFO replication \
      | grep "^master_host:" | cut -d: -f2 | tr -d '\r\n')

    if [ "$current" = "$NODE_132" ]; then
      log_info "[$node] 已指向 $NODE_132"
      continue
    fi

    log_warn "[$node] 当前指向 $current,正在重新配置..."
    redis_cmd "$node" "$REDIS_PORT" SLAVEOF "$NODE_132" "$REDIS_PORT"
    sleep 1

    # Re-read the master address to verify the change took effect.
    current=$(redis_cmd "$node" "$REDIS_PORT" INFO replication \
      | grep "^master_host:" | cut -d: -f2 | tr -d '\r\n')
    if [ "$current" != "$NODE_132" ]; then
      log_error "[$node] 配置失败"
    else
      log_info "[$node] 配置成功"
    fi
  done
}

# Deploy the Review Sentinel stack from SCRIPT_DIR and report, per node,
# whether its Sentinel answers for the master named "reviewmaster".
# Requires docker-compose-sentinel.yml and env_review in SCRIPT_DIR.
# Returns: 1 when one of the two files is missing.
deploy_sentinel() {
  local required node master

  echo ""
  log_info "正在部署 Review Sentinel..."

  cd "$SCRIPT_DIR"

  for required in docker-compose-sentinel.yml env_review; do
    if [ ! -f "./$required" ]; then
      log_error "未找到 $required"
      return 1
    fi
  done

  # Render the compose file with the variables from env_review and hand the
  # result to docker on stdin.
  env $(xargs < ./env_review) envsubst < ./docker-compose-sentinel.yml \
    | docker stack deploy --compose-file - review_sentinel

  log_info "Sentinel 部署命令已执行,等待服务启动..."
  sleep 10

  # Probe every Sentinel for the master address it currently knows.
  log_info "检查 Sentinel 状态..."
  for node in "$NODE_132" "$NODE_133" "$NODE_134"; do
    master=$(redis_cmd "$node" "$SENTINEL_PORT" SENTINEL get-master-addr-by-name reviewmaster 2>/dev/null) || true
    if [ -z "$master" ]; then
      log_warn "[$node:$SENTINEL_PORT] Sentinel 未就绪或未响应"
    else
      log_info "[$node:$SENTINEL_PORT] Sentinel 运行正常,Master: $master"
    fi
  done
}

# Remove the Review Sentinel stack. Removal is best effort: a failing
# `docker stack rm` (for example when the stack does not exist) is ignored.
remove_sentinel() {
  echo ""
  log_info "正在移除 Review Sentinel..."
  docker stack rm review_sentinel 2>/dev/null || :
  sleep 5
  log_info "Sentinel 已移除"
}

# ==================== Safe-rollback helpers ====================

# Check whether the production node answers PING.
# Returns: 0 when the reply is exactly "PONG", 1 otherwise.
check_prod_connectable() {
  local pong
  pong=$(redis_cmd "$PROD_MASTER" "$PROD_PORT" PING 2>/dev/null) || true
  [ "$pong" = "PONG" ]
}

# Print the replication offset reported by a node on REDIS_PORT.
# The value of the master_repl_offset line is preferred; when that is empty
# the slave_repl_offset value is used. Prints an empty line when neither is
# available (for example when the node cannot be reached).
# Arguments: $1 - host
get_repl_offset() {
  local host=$1
  local info offset

  info=$(redis_cmd "$host" "$REDIS_PORT" INFO replication 2>/dev/null) || true

  offset=$(awk -F: '/master_repl_offset:/ {print $2}' <<<"$info" | tr -d '\r\n')
  [ -n "$offset" ] \
    || offset=$(awk -F: '/slave_repl_offset:/ {print $2}' <<<"$info" | tr -d '\r\n')

  echo "$offset"
}

# Print the replication role of a node queried on REDIS_PORT, without a
# trailing newline. Prints nothing when the node gives no answer.
# Arguments: $1 - host
get_node_role() {
  local host=$1
  local info

  info=$(redis_cmd "$host" "$REDIS_PORT" INFO replication 2>/dev/null) || true
  awk -F: '/^role:/ {print $2}' <<<"$info" | tr -d '\r\n'
}

# Print the master_link_status value of a node queried on REDIS_PORT,
# without a trailing newline. Prints nothing when the field is absent.
# Arguments: $1 - host
get_slave_sync_status() {
  local host=$1
  local info

  info=$(redis_cmd "$host" "$REDIS_PORT" INFO replication 2>/dev/null) || true
  awk -F: '/master_link_status:/ {print $2}' <<<"$info" | tr -d '\r\n'
}

# Run a Redis command against the production node over the network and
# fall back to asking for manual execution when no reply is received.
# Arguments: the redis-cli arguments, passed through unchanged (arguments
#            containing whitespace stay intact).
# Outputs:   the command reply on stdout; log messages go to stderr so that
#            a caller capturing stdout receives only the reply.
# Returns:   0 when a non-empty reply was received, 1 otherwise.
exec_prod_redis_cmd() {
  # Keep the arguments as an array: flattening "$@" into a string and
  # re-splitting it would break arguments that contain spaces.
  local -a redis_args=("$@")
  local result=""

  log_info "尝试远程执行 Redis 命令..." >&2

  # A failing redis_cmd must reach the manual-fallback message below rather
  # than abort the script under `set -e`.
  result=$(redis_cmd "$PROD_MASTER" "$PROD_PORT" "${redis_args[@]}" 2>/dev/null) || true

  if [ -n "$result" ]; then
    echo "$result"
    return 0
  fi

  log_warn "无法远程执行命令,请在生产服务器上手动执行" >&2
  return 1
}

# Rollback step 1: make the production node a slave of node 132 so that the
# data written to 132 during the outage flows back to production.
# Preconditions checked here: 132 reports role "master" and production
# answers PING. When SLAVEOF is not acknowledged with OK, the equivalent
# manual commands are printed and the operator is asked to confirm that
# they were run.
# Returns: 0 when production reports role "slave" afterwards, 1 otherwise.
rollback_step1_make_prod_slave() {
  local role_132 reply prod_role prod_sync

  echo ""
  printf '%b\n' \
    "${BLUE}========================================${NC}" \
    "${BLUE} 步骤 1: 配置生产节点为 132 的从节点 ${NC}" \
    "${BLUE}========================================${NC}"

  # 132 must currently be the master.
  role_132=$(get_node_role "$NODE_132") || true
  if [ "$role_132" != "master" ]; then
    log_error "132 当前不是 Master (角色: $role_132),无法执行回滚"
    log_error "请先确认 132 已通过 failover 提升为 Master"
    return 1
  fi
  log_info "确认 132 当前为 Master"

  # Production must be reachable.
  if ! check_prod_connectable; then
    log_error "生产节点 $PROD_MASTER 无法连接,请先确保生产 Redis 已恢复"
    return 1
  fi
  log_info "生产节点 $PROD_MASTER 可连接"

  log_info "正在配置 $PROD_MASTER 作为 $NODE_132 的从节点..."

  # Try the remote command first.
  reply=$(redis_cmd "$PROD_MASTER" "$PROD_PORT" SLAVEOF "$NODE_132" "$REDIS_PORT" 2>/dev/null) || true

  if [ "$reply" = "OK" ]; then
    log_info "远程执行成功: SLAVEOF $NODE_132 $REDIS_PORT"
  else
    # Remote execution did not work: show the manual alternatives.
    echo ""
    printf '%b\n' \
      "${YELLOW}========================================${NC}" \
      "${YELLOW} 无法远程执行,请在生产服务器 ($PROD_MASTER) 上执行: ${NC}" \
      "${YELLOW}========================================${NC}"
    printf '%s\n' \
      "" \
      "# 方法1: 进入 Redis 容器执行" \
      "docker exec -it \$(docker ps -q -f name=redis) redis-cli -a '$REDIS_PASSWORD' --no-auth-warning SLAVEOF $NODE_132 $REDIS_PORT" \
      "" \
      "# 方法2: 使用临时容器执行" \
      "docker run --rm --network host $REDIS_IMAGE redis-cli -h $PROD_MASTER -p $PROD_PORT -a '$REDIS_PASSWORD' --no-auth-warning SLAVEOF $NODE_132 $REDIS_PORT" \
      ""

    if ! confirm_dangerous "已在生产服务器执行上述命令?"; then
      log_warn "操作已取消"
      return 1
    fi
  fi

  # Verify the resulting role and replication link.
  sleep 2
  prod_role=$(get_node_role "$PROD_MASTER") || true
  prod_sync=$(get_slave_sync_status "$PROD_MASTER") || true

  if [ "$prod_role" != "slave" ]; then
    log_error "配置失败,生产节点角色: $prod_role"
    return 1
  fi

  log_info "生产节点已配置为从节点"
  log_info "同步状态: $prod_sync"
  if [ "$prod_sync" = "up" ]; then
    log_info "数据同步连接正常"
  else
    log_warn "同步连接状态: $prod_sync,请等待连接建立"
  fi
}

# Rollback step 2: wait (up to 5 minutes, polling every 5 seconds) until the
# production node has caught up with node 132.
# The two replication offsets are only compared when both were read as
# plain integers; a missing offset (node unreachable) is never treated as
# "in sync". The production node is queried on PROD_PORT.
# Returns: 0 when the operator-confirmed offset difference is 0 or below
#          100, or the operator accepts a larger difference / forces
#          continuation after the timeout; 1 otherwise.
rollback_step2_wait_sync() {
  local max_wait=300 # wait at most 5 minutes
  local wait_time=0
  local check_interval=5
  local master_offset slave_offset sync_status offset_diff abs_diff

  echo ""
  echo -e "${BLUE}========================================${NC}"
  echo -e "${BLUE} 步骤 2: 等待数据同步完成 ${NC}"
  echo -e "${BLUE}========================================${NC}"

  while [ "$wait_time" -lt "$max_wait" ]; do
    master_offset=$(get_repl_offset "$NODE_132") || true
    slave_offset=$(redis_cmd "$PROD_MASTER" "$PROD_PORT" INFO replication 2>/dev/null \
      | grep "^slave_repl_offset:" | cut -d: -f2 | tr -d '\r\n') || true
    sync_status=$(get_slave_sync_status "$PROD_MASTER") || true

    # Only do arithmetic on values known to be integers; an empty operand
    # would either abort the shell or yield a misleading negative number.
    offset_diff=""
    if [[ "$master_offset" =~ ^[0-9]+$ && "$slave_offset" =~ ^[0-9]+$ ]]; then
      offset_diff=$((master_offset - slave_offset))
    fi

    echo -e " Master(132) 偏移量: ${GREEN}$master_offset${NC}"
    echo -e " Slave(10.56) 偏移量: ${YELLOW}$slave_offset${NC}"
    echo -e " 同步状态: $sync_status"
    echo -e " 偏移量差异: ${offset_diff:-未知}"

    if [ "$sync_status" != "up" ]; then
      log_warn "同步连接未建立,等待中..."
      sleep "$check_interval"
      wait_time=$((wait_time + check_interval))
      continue
    fi

    # The master offset is sampled first, so under write load the slave
    # sample can be slightly ahead; compare the magnitude of the difference.
    abs_diff=${offset_diff#-}

    if [ -z "$offset_diff" ]; then
      log_warn "无法读取复制偏移量,等待中..."
    elif [ "$abs_diff" -lt 1000 ]; then
      log_info "数据同步基本完成(偏移量差异: $offset_diff)"

      # Ask the operator to stop writes, then take a final measurement.
      echo ""
      log_warn "建议:在执行下一步前,先停止 Review 应用对 Redis 的写入"
      if confirm_prompt "是否已停止写入并继续?"; then
        sleep 2
        master_offset=$(get_repl_offset "$NODE_132") || true
        slave_offset=$(redis_cmd "$PROD_MASTER" "$PROD_PORT" INFO replication 2>/dev/null \
          | grep "^slave_repl_offset:" | cut -d: -f2 | tr -d '\r\n') || true

        if [[ "$master_offset" =~ ^[0-9]+$ && "$slave_offset" =~ ^[0-9]+$ ]]; then
          offset_diff=$((master_offset - slave_offset))
          abs_diff=${offset_diff#-}

          if [ "$offset_diff" -eq 0 ]; then
            log_info "数据完全同步(偏移量差异: 0)"
            return 0
          fi

          log_info "当前偏移量差异: $offset_diff"
          # Differences below 100 bytes continue automatically.
          if [ "$abs_diff" -lt 100 ] \
            || confirm_prompt "偏移量差异 $offset_diff 字节,是否继续执行切换?"; then
            return 0
          fi
        else
          log_warn "无法读取复制偏移量,等待中..."
        fi
      fi
    fi

    echo "等待同步... ($wait_time/$max_wait 秒)"
    sleep "$check_interval"
    wait_time=$((wait_time + check_interval))
    echo ""
  done

  log_error "同步超时,请检查网络和 Redis 状态"
  if confirm_dangerous "是否强制继续?"; then
    return 0
  fi
  return 1
}

# Helper: lift the write block on node 132 again (min-replicas-to-write 0).
_rollback_restore_132_writable() {
  redis_cmd "$NODE_132" "$REDIS_PORT" CONFIG SET min-replicas-to-write 0
}

# Rollback step 3: switch the master back to production.
#   3.0 block writes on 132 (min-replicas-to-write=99) and re-check offsets
#   3.1 promote production with SLAVEOF NO ONE (manual fallback offered)
#   3.2 point 132 at production and restore its hidden-replica settings
#   3.3 point 133/134 at 132 again
# Differences from a naive implementation: the reply of the write-blocking
# CONFIG SET is checked, offsets are compared only when both are integers,
# and the production node is queried on PROD_PORT.
# Returns: 0 when the switch was carried out, 1 when it was cancelled or
#          production did not become master (132 is made writable again).
rollback_step3_switch_master() {
  local ro_reply master_offset slave_offset offset_diff
  local result prod_role role_132 sync_132 node role sync

  echo ""
  echo -e "${BLUE}========================================${NC}"
  echo -e "${BLUE} 步骤 3: 切换主从关系 ${NC}"
  echo -e "${BLUE}========================================${NC}"

  # 3.0 Block writes on 132 so nothing new arrives during the switch.
  log_info "将 132 设为只读模式,防止切换期间数据写入..."
  ro_reply=$(redis_cmd "$NODE_132" "$REDIS_PORT" CONFIG SET min-replicas-to-write 99) || true
  if [ "$ro_reply" = "OK" ]; then
    log_info "132 已设为只读(min-replicas-to-write=99)"
  else
    # Without the write block, data written during the switch can be lost.
    log_error "无法将 132 设为只读: ${ro_reply:-无响应}"
    if ! confirm_dangerous "132 仍可写入,切换期间的新数据可能丢失,是否继续?"; then
      _rollback_restore_132_writable || true
      log_warn "已取消,132 已恢复可写"
      return 1
    fi
  fi

  # Give the last writes time to reach production.
  log_info "等待最后的数据同步..."
  sleep 3

  # Final offset comparison.
  master_offset=$(get_repl_offset "$NODE_132") || true
  slave_offset=$(redis_cmd "$PROD_MASTER" "$PROD_PORT" INFO replication 2>/dev/null \
    | grep "^slave_repl_offset:" | cut -d: -f2 | tr -d '\r\n') || true

  if [[ "$master_offset" =~ ^[0-9]+$ && "$slave_offset" =~ ^[0-9]+$ ]]; then
    offset_diff=$((master_offset - slave_offset))
    log_info "最终偏移量差异: $offset_diff"

    # Small differences continue automatically; larger ones need approval.
    if [ "$offset_diff" -gt 100 ]; then
      log_warn "仍有 $offset_diff 字节数据未同步"
      if ! confirm_dangerous "是否继续切换?"; then
        _rollback_restore_132_writable
        log_warn "已取消,132 已恢复可写"
        return 1
      fi
    elif [ "$offset_diff" -gt 0 ]; then
      log_info "偏移量差异 $offset_diff 字节(<100),自动继续"
    fi
  else
    # An unreadable offset proves nothing about the sync state.
    log_warn "无法读取复制偏移量(132: '$master_offset', $PROD_MASTER: '$slave_offset'),无法确认数据已同步"
    if ! confirm_dangerous "是否继续切换?"; then
      _rollback_restore_132_writable
      log_warn "已取消,132 已恢复可写"
      return 1
    fi
  fi

  # 3.1 Promote production to master.
  log_info "正在提升 $PROD_MASTER 为主节点..."

  result=$(redis_cmd "$PROD_MASTER" "$PROD_PORT" SLAVEOF NO ONE 2>/dev/null) || true

  if [ "$result" = "OK" ]; then
    log_info "远程执行成功: SLAVEOF NO ONE"
  else
    # Remote execution did not work: show the manual alternatives.
    echo ""
    echo -e "${YELLOW}========================================${NC}"
    echo -e "${YELLOW} 无法远程执行,请在生产服务器 ($PROD_MASTER) 上执行: ${NC}"
    echo -e "${YELLOW}========================================${NC}"
    echo ""
    echo "# 方法1: 进入 Redis 容器执行"
    echo "docker exec -it \$(docker ps -q -f name=redis) redis-cli -a '$REDIS_PASSWORD' --no-auth-warning SLAVEOF NO ONE"
    echo ""
    echo "# 方法2: 使用临时容器执行"
    echo "docker run --rm --network host $REDIS_IMAGE redis-cli -h $PROD_MASTER -p $PROD_PORT -a '$REDIS_PASSWORD' --no-auth-warning SLAVEOF NO ONE"
    echo ""

    if ! confirm_dangerous "已执行上述命令?"; then
      _rollback_restore_132_writable
      log_warn "已取消,132 已恢复可写"
      return 1
    fi
  fi

  sleep 2
  prod_role=$(get_node_role "$PROD_MASTER") || true
  if [ "$prod_role" = "master" ]; then
    log_info "$PROD_MASTER 已提升为 Master"
  else
    log_error "提升失败,当前角色: $prod_role"
    _rollback_restore_132_writable
    return 1
  fi

  # 3.2 Point 132 at production right away to keep the window short.
  log_info "正在配置 $NODE_132 指向 $PROD_MASTER..."
  redis_cmd "$NODE_132" "$REDIS_PORT" SLAVEOF "$PROD_MASTER" "$PROD_PORT"

  # Reset the write block so the configuration stays clean.
  _rollback_restore_132_writable

  # Restore the hidden-replica settings.
  log_info "恢复 132 的隐藏从节点配置..."
  redis_cmd "$NODE_132" "$REDIS_PORT" CONFIG SET replica-announced no
  redis_cmd "$NODE_132" "$REDIS_PORT" CONFIG SET replica-priority 0

  sleep 2
  role_132=$(get_node_role "$NODE_132") || true
  sync_132=$(get_slave_sync_status "$NODE_132") || true
  if [ "$role_132" = "slave" ] && [ "$sync_132" = "up" ]; then
    log_info "$NODE_132 已配置为从节点,同步状态: $sync_132"
  else
    log_warn "$NODE_132 配置完成,角色: $role_132, 同步状态: $sync_132"
  fi

  # 3.3 Re-attach 133/134 to 132.
  log_info "正在配置从节点指向 $NODE_132..."
  for node in "$NODE_133" "$NODE_134"; do
    redis_cmd "$node" "$REDIS_PORT" SLAVEOF "$NODE_132" "$REDIS_PORT"
    sleep 1
    role=$(get_node_role "$node") || true
    sync=$(get_slave_sync_status "$node") || true
    log_info "[$node] 角色: $role, 同步状态: $sync"
  done
}

# Safe rollback entry point: explain the procedure and its preconditions,
# obtain the operator's confirmations, remove the Review Sentinel and run
# the three rollback steps in order.
# Returns: 1 when the dangerous-operation confirmation is refused, when
#          production is unavailable or when a step fails; 0 when the
#          operator cancels at the final prompt or the rollback completes.
safe_rollback() {
  local step

  echo ""
  printf '%b\n' \
    "${RED}╔════════════════════════════════════════════════════════════╗${NC}" \
    "${RED}║ 安全回滚流程 ║${NC}" \
    "${RED}║ ║${NC}" \
    "${RED}║ 此流程会先将故障期间 132 的新数据同步到生产 10.56, ║${NC}" \
    "${RED}║ 确保数据不丢失后再切换回原有架构。 ║${NC}" \
    "${RED}╚════════════════════════════════════════════════════════════╝${NC}"
  printf '%s\n' \
    "" \
    "回滚流程说明:" \
    " 1. 让 10.56 先作为 132 的从节点(反向同步)" \
    " 2. 等待数据完全同步" \
    " 3. 提升 10.56 为主节点,132 重新作为隐藏从节点" \
    " 4. 恢复 133/134 级联复制" \
    "" \
    "前置条件:" \
    " - 132 当前是 Master(故障切换状态)" \
    " - 10.56 已恢复并可连接"
  printf '%b\n' \
    " - ${YELLOW}【重要】所有应用已切换到 Review Sentinel${NC}" \
    " - ${YELLOW}【重要】生产 Sentinel 已停止${NC}"
  printf '%s\n' \
    " - 建议先停止 Review 应用的写入" \
    ""

  # Reminder about the production Sentinel.
  printf '%b\n' \
    "${YELLOW}========================================${NC}" \
    "${YELLOW} ⚠️ 生产 Sentinel 处理提醒 ${NC}" \
    "${YELLOW}========================================${NC}"
  printf '%s\n' \
    "" \
    "在执行回滚前,必须先停止生产 Sentinel!" \
    "" \
    "原因:" \
    " 1. 回滚步骤1会让 10.56 变成 132 的从节点" \
    " 2. 如果 Sentinel 还在运行,会检测到 master 下线并触发故障转移" \
    " 3. 此外,任何仍通过生产 Sentinel 连接的应用会收到 READONLY 错误" \
    ""
  printf '%b\n' "${RED}【重要】请确认以下条件:${NC}"
  printf '%s\n' \
    " 1. 所有应用都已切换到 Review Sentinel (reviewmaster)" \
    " 2. 生产 Sentinel 已停止" \
    "" \
    "停止生产 Sentinel 命令(在生产服务器执行):" \
    " docker stack rm prod_sentinel" \
    " # 或者根据实际部署名称调整" \
    ""

  # Dangerous operation: always confirmed unless --force is active.
  if ! confirm_dangerous "已确认所有应用已切换且生产 Sentinel 已停止?"; then
    log_warn "请先完成上述操作后再执行回滚"
    return 1
  fi

  if ! check_prod_status; then
    log_error "生产环境不可用,无法执行回滚"
    return 1
  fi
  check_status

  if ! confirm_prompt "确认开始安全回滚?"; then
    echo "已取消"
    return 0
  fi

  # The Review Sentinel is removed before the topology changes.
  remove_sentinel

  # Run the three steps; stop at the first one that fails.
  for step in rollback_step1_make_prod_slave rollback_step2_wait_sync rollback_step3_switch_master; do
    "$step" || return 1
  done

  echo ""
  printf '%b\n' \
    "${GREEN}========================================${NC}" \
    "${GREEN} 安全回滚完成 ${NC}" \
    "${GREEN}========================================${NC}"
  check_status

  printf '%s\n' \
    "" \
    "后续操作:" \
    " 1. 重新启动生产 Sentinel(如果之前停止了)" \
    " 2. 验证生产 Sentinel 是否正常识别 10.56 为主节点" \
    " 3. 修改应用配置,连接回生产 Sentinel" \
    " 4. 确认 132/133/134 对生产 Sentinel 不可见"
}

# Legacy, unsafe rollback (kept, but guarded by a warning): points 132
# straight back at production without first syncing its data there, then
# re-attaches 133/134 to 132. Requires the operator to type YES.
# Returns: 0 both when cancelled and when the commands were issued.
unsafe_rollback() {
  local node

  echo ""
  printf '%b\n' \
    "${RED}╔════════════════════════════════════════════════════════════╗${NC}" \
    "${RED}║ ⚠️ 警告 ⚠️ ║${NC}" \
    "${RED}║ ║${NC}" \
    "${RED}║ 此操作会直接让 132 重新指向 10.56,可能导致数据丢失! ║${NC}" \
    "${RED}║ 故障期间 132 上的所有写入数据将被覆盖! ║${NC}" \
    "${RED}║ ║${NC}" \
    "${RED}║ 推荐使用: ./failover-to-review.sh rollback ║${NC}" \
    "${RED}╚════════════════════════════════════════════════════════════╝${NC}"
  echo ""

  # This prompt is not affected by -y / --force; the literal word is required.
  read -p "确认执行不安全回滚(数据可能丢失)? 输入 'YES' 确认: " confirm
  if [ "$confirm" != "YES" ]; then
    echo "已取消"
    return 0
  fi

  log_warn "正在执行不安全回滚..."

  # The Review Sentinel goes first.
  remove_sentinel

  # Point 132 back at production.
  log_info "配置 $NODE_132 指向生产 $PROD_MASTER..."
  redis_cmd "$NODE_132" "$REDIS_PORT" SLAVEOF "$PROD_MASTER" "$PROD_PORT"
  sleep 2

  # Re-attach 133/134 to 132.
  for node in "$NODE_133" "$NODE_134"; do
    log_info "配置 $node 指向 $NODE_132..."
    redis_cmd "$node" "$REDIS_PORT" SLAVEOF "$NODE_132" "$REDIS_PORT"
    sleep 1
  done

  log_warn "不安全回滚完成,请检查数据一致性"
}

# Print the post-switch summary: the current node status followed by the
# connection settings (direct and Sentinel mode) for the Review application.
# NOTE: the Redis and Sentinel passwords are printed in clear text.
show_final_info() {
  echo ""
  printf '%b\n' \
    "${BLUE}========================================${NC}" \
    "${BLUE} 切换完成 ${NC}" \
    "${BLUE}========================================${NC}"

  check_status

  printf '%b\n' \
    "${GREEN}========================================${NC}" \
    "${GREEN} Review 应用连接配置 ${NC}" \
    "${GREEN}========================================${NC}"
  printf '%s\n' \
    "" \
    "直连模式:" \
    " host: $NODE_132" \
    " port: $REDIS_PORT" \
    " password: $REDIS_PASSWORD" \
    "" \
    "Sentinel 模式 (Spring Boot):" \
    " spring:" \
    " redis:" \
    " sentinel:" \
    " master: reviewmaster" \
    " nodes: $NODE_132:$SENTINEL_PORT,$NODE_133:$SENTINEL_PORT,$NODE_134:$SENTINEL_PORT" \
    " password: $SENTINEL_PASSWORD" \
    " password: $REDIS_PASSWORD" \
    ""
}

# ==================== Main ====================

# Script entry point: parse the global switches, print the banner and
# dispatch to the requested sub-command.
# The sub-command is the first argument that is not one of the global
# switches (-y, --yes, --force), so the switches may be given before or
# after it. With no sub-command, or an unknown one, the help text is shown.
# Arguments: "$@" of the script
main() {
  local command="help"
  local arg

  # Parse the global switches.
  parse_global_args "$@"

  # Locate the sub-command, skipping the global switches.
  for arg in "$@"; do
    case "$arg" in
      -y|--yes|--force) ;;
      *)
        command=$arg
        break
        ;;
    esac
  done

  echo -e "${GREEN}"
  echo "╔════════════════════════════════════════════╗"
  echo "║ Review Redis 故障切换脚本 ║"
  echo "╚════════════════════════════════════════════╝"
  echo -e "${NC}"

  if [ "$AUTO_CONFIRM" = true ]; then
    log_info "已启用自动确认模式 (-y)"
  fi
  if [ "$FORCE_MODE" = true ]; then
    log_warn "已启用强制模式 (--force),将跳过所有确认"
  fi

  # Sub-commands that run Redis commands need the Redis image.
  case "$command" in
    status|failover|full|rollback|unsafe-rollback)
      check_redis_image || exit 1
      ;;
  esac

  case "$command" in
    status)
      check_prod_status || true
      check_status
      ;;
    failover)
      check_status
      echo ""
      if confirm_prompt "确认执行故障切换(提升 132 为 Master)?"; then
        promote_132
        configure_slaves
        check_status
      else
        echo "已取消"
      fi
      ;;
    sentinel)
      deploy_sentinel
      ;;
    full)
      check_prod_status || true
      check_status
      echo ""
      if confirm_prompt "确认执行完整故障切换(包含 Sentinel 部署)?"; then
        promote_132
        configure_slaves
        deploy_sentinel
        show_final_info
      else
        echo "已取消"
      fi
      ;;
    rollback)
      safe_rollback
      ;;
    unsafe-rollback)
      unsafe_rollback
      check_status
      ;;
    help|*)
      echo "用法: $0 {status|failover|sentinel|full|rollback|unsafe-rollback} [-y|--yes] [--force]"
      echo ""
      echo "命令说明:"
      echo " status - 查看当前 Redis 状态"
      echo " failover - 执行故障切换(提升 132 为 Master)"
      echo " sentinel - 仅部署 Review Sentinel"
      echo " full - 完整切换(failover + sentinel)"
      echo " rollback - 安全回滚(先同步数据到生产,再切换)"
      echo " unsafe-rollback - 不安全回滚(直接切换,可能丢数据)"
      echo ""
      echo "可选参数:"
      echo " -y, --yes - 自动确认普通提示(危险操作仍需手动确认)"
      echo " --force - 强制模式,跳过所有确认(慎用!)"
      echo ""
      echo "示例:"
      echo " $0 full -y # 自动确认执行完整切换"
      echo " $0 rollback # 交互式安全回滚"
      echo " $0 rollback --force # 强制执行回滚(跳过所有确认)"
      echo ""
      echo "故障切换流程:"
      echo " 1. 执行 ./failover-to-review.sh status 检查状态"
      echo " 2. 执行 ./failover-to-review.sh full 完整切换"
      echo " 3. 修改应用配置,连接 Review Sentinel"
      echo ""
      echo "安全恢复流程(推荐):"
      echo " 1. 确认生产环境 Redis 已恢复"
      echo " 2. 【重要】确认所有应用已切换到 Review Sentinel"
      echo " 3. 【重要】停止生产 Sentinel: docker stack rm prod_sentinel"
      echo " 4. 建议先停止 Review 应用对 Redis 的写入"
      echo " 5. 执行 ./failover-to-review.sh rollback"
      echo " - 步骤1: 让 10.56 作为 132 的从节点(反向同步)"
      echo " - 步骤2: 等待数据完全同步"
      echo " - 步骤3: 将 132 设为只读,提升 10.56 为主节点,恢复原架构"
      echo " 6. 重新启动生产 Sentinel"
      echo " 7. 修改应用配置,连接生产 Sentinel"
      echo ""
      echo "注意:"
      echo " - 所有 Redis 操作通过 Docker 容器执行,需要 $REDIS_IMAGE 镜像"
      echo " - 如果能连接生产 Redis,命令会自动远程执行"
      echo " - 如果无法连接,会提示手动在生产服务器执行"
      echo " - 如果 review_redis_master 容器运行中,优先使用该容器"
      echo " - 否则使用临时容器执行命令"
      ;;
  esac
}

# Run the script with the arguments it was invoked with.
main "${@}"