MySQL运维-高可用架构
大约 6 分钟
MySQL运维-高可用架构
业务场景引入
高可用架构是保障业务连续性的关键:
- 7×24服务:电商平台需要全天候不间断服务
- 故障容错:单点故障不能影响整体业务
- 数据一致性:确保数据在多个节点间保持一致
- 自动故障切换:故障发生时快速自动恢复
高可用架构方案
架构演进路径
可用性级别对比
| 架构方案 | 可用性 | RTO | RPO | 复杂度 |
| --- | --- | --- | --- | --- |
| 主从复制 | 99.9% | 5-30分钟 | 分钟级 | 低 |
| MHA | 99.95% | 30-60秒 | 秒级 | 中 |
| MGR | 99.99% | 10-30秒 | 0 | 高 |
主从复制架构
基础配置
-- 主库配置 (my.cnf)
[mysqld]
# Unique ID per replication member; must differ on every host in the topology.
server-id = 1
# Enable the binary log (required on the source for replication).
log-bin = mysql-bin
# Row-based logging replicates actual row images -- safest for consistency.
binlog-format = ROW
# GTID-based replication: lets replicas auto-position and simplifies failover.
gtid-mode = ON
enforce-gtid-consistency = ON
-- 从库配置 (my.cnf)
[mysqld]
server-id = 2
# Reject writes from ordinary users on the replica...
read_only = 1
# ...and from users with SUPER privilege as well.
super_read_only = 1
# Recover a consistent relay-log position automatically after a replica crash.
relay_log_recovery = 1
主从搭建脚本
#!/bin/bash
# Deploy MySQL master-slave (source-replica) replication, GTID-based.
set -euo pipefail

MASTER_HOST="192.168.1.10"
SLAVE_HOST="192.168.1.11"
REPL_USER="repl"
REPL_PASS="password"
# FIX: the original invoked `mysql -p` three times, prompting interactively
# each time -- the script could not run unattended. Read the root password
# once from the environment instead.
ROOT_PASS="${MYSQL_ROOT_PASS:?set MYSQL_ROOT_PASS in the environment}"

# Create the replication user on the master (idempotent).
mysql -h"${MASTER_HOST}" -uroot -p"${ROOT_PASS}" << EOF
CREATE USER IF NOT EXISTS '${REPL_USER}'@'%' IDENTIFIED BY '${REPL_PASS}';
GRANT REPLICATION SLAVE ON *.* TO '${REPL_USER}'@'%';
FLUSH PRIVILEGES;
EOF

# FIX: gtid-mode is ON in my.cnf, so use GTID auto-positioning instead of
# parsing SHOW MASTER STATUS -- the original's file/position capture raced
# against concurrent writes between the read and the CHANGE MASTER.
mysql -h"${SLAVE_HOST}" -uroot -p"${ROOT_PASS}" << EOF
CHANGE MASTER TO
  MASTER_HOST='${MASTER_HOST}',
  MASTER_USER='${REPL_USER}',
  MASTER_PASSWORD='${REPL_PASS}',
  MASTER_AUTO_POSITION=1;
START SLAVE;
EOF
echo "主从复制配置完成"
复制监控
-- Check replication status (human-readable; run in the mysql client).
SHOW SLAVE STATUS\G

-- Key metric monitoring.
-- FIX: a SHOW statement cannot appear in a FROM clause -- the original
-- "FROM (SHOW SLAVE STATUS) s" is invalid SQL and fails to parse.
-- Query the performance_schema replication tables instead (MySQL 5.7+).
SELECT
  'Slave_IO_Running' AS metric,
  IF(SERVICE_STATE = 'ON', 'OK', 'ERROR') AS status
FROM performance_schema.replication_connection_status
UNION ALL
SELECT
  'Slave_SQL_Running',
  IF(SERVICE_STATE = 'ON', 'OK', 'ERROR')
FROM performance_schema.replication_applier_status;

-- Seconds_Behind_Master has no direct performance_schema equivalent;
-- read it from SHOW SLAVE STATUS in the monitoring client and grade it
-- there: NULL -> ERROR, > 60 -> WARNING, otherwise OK.
MHA架构实施
MHA环境部署
#!/bin/bash
# Deploy the MHA (Master High Availability) Manager node.
# Abort on the first failed step so a broken install is not half-configured.
set -e

# FIX: perl-Parallel-ForkManager is a hard runtime dependency of
# masterha_manager and was missing from the original package list.
# NOTE(review): the manager host normally also needs the mha4mysql-node
# package installed -- confirm against your MHA version.
yum install -y perl-DBD-MySQL perl-Config-Tiny perl-Log-Dispatch perl-Parallel-ForkManager
wget https://github.com/yoshinorim/mha4mysql-manager/releases/download/v0.58/mha4mysql-manager-0.58.tar.gz
tar xf mha4mysql-manager-0.58.tar.gz
cd mha4mysql-manager-0.58
perl Makefile.PL && make && make install

# Directories for MHA configuration and working/log files.
mkdir -p /etc/mha
mkdir -p /var/log/mha

# Cluster definition: server1/server2 are promotion candidates,
# server3 (no_master=1) is never promoted.
cat > /etc/mha/mysql_cluster.conf << EOF
[server default]
manager_log=/var/log/mha/mysql_cluster.log
manager_workdir=/var/log/mha
master_binlog_dir=/var/lib/mysql
user=mha
password=mha_password
ping_interval=1
repl_user=repl
repl_password=repl_password
ssh_user=root
[server1]
hostname=192.168.1.10
port=3306
candidate_master=1
[server2]
hostname=192.168.1.11
port=3306
candidate_master=1
[server3]
hostname=192.168.1.12
port=3306
no_master=1
EOF
MHA监控脚本
#!/bin/bash
# Operate the MHA Manager for the mysql_cluster replication set:
# status query, replication check, and (idempotent) startup.
MHA_CONF="/etc/mha/mysql_cluster.conf"
MHA_LOG="/var/log/mha/mysql_cluster.log"

# Exit 0 when the MHA Manager reports a healthy running status.
check_mha_status() {
    masterha_check_status --conf="${MHA_CONF}"
}

# Exit 0 when replication is intact across every configured server.
check_replication() {
    masterha_check_repl --conf="${MHA_CONF}"
}

# Start the manager in the background unless it is already running.
start_mha() {
    if check_mha_status; then
        echo "MHA Manager已经运行"
        return
    fi
    echo "启动MHA Manager..."
    nohup masterha_manager --conf="${MHA_CONF}" > "${MHA_LOG}" 2>&1 &
    sleep 5
    if check_mha_status; then
        echo "MHA Manager启动成功"
    else
        echo "MHA Manager启动失败"
    fi
}

# Command dispatch.
case "$1" in
    status) check_mha_status ;;
    check)  check_replication ;;
    start)  start_mha ;;
    *)      echo "Usage: $0 {status|check|start}" ;;
esac
MGR集群架构
MGR配置部署
-- MGR配置 (my.cnf)
[mysqld]
# Unique ID per group member.
server-id = 1
# Group Replication requires GTIDs and row-based binary logging.
gtid-mode = ON
enforce-gtid-consistency = ON
# NOTE(review): disabling binlog checksums was required by MGR on 5.7;
# newer versions lift this restriction -- confirm for your MySQL version.
binlog-checksum = NONE
# Members must write transactions applied from the group to their own binlog.
log-slave-updates = ON
log-bin = binlog
binlog-format = ROW
# Group Replication settings
plugin-load-add = group_replication.so
# Same UUID on every member -- it identifies the group.
group_replication_group_name = "aaaaaaaa-aaaa-aaaa-aaaa-aaaaaaaaaaaa"
# Join manually after startup instead of auto-starting on boot.
group_replication_start_on_boot = OFF
# This member's internal communication endpoint (set per node).
group_replication_local_address = "192.168.1.10:33061"
# Seed list used by joining members to find the group.
group_replication_group_seeds = "192.168.1.10:33061,192.168.1.11:33061,192.168.1.12:33061"
-- Initialize MGR.
-- Run on the FIRST node only: bootstrapping declares this member the
-- founding (single) member of a brand-new group.
SET GLOBAL group_replication_bootstrap_group=ON;
START GROUP_REPLICATION;
-- Turn bootstrap off again immediately so a later restart cannot
-- accidentally found a second, split group.
SET GLOBAL group_replication_bootstrap_group=OFF;
-- Run on every OTHER node: they discover the group via group_seeds.
START GROUP_REPLICATION;
MGR管理脚本
#!/usr/bin/env python3
import mysql.connector
import time
import logging
class MGRManager:
    """Manage a MySQL Group Replication (MGR) cluster.

    Connects to each node with hard-coded admin credentials
    (NOTE(review): move user/password into configuration) and offers
    status inspection, group bootstrapping, and join helpers.
    """

    def __init__(self, hosts):
        # hosts: iterable of MySQL host addresses forming the group.
        self.hosts = hosts
        self.setup_logging()

    def setup_logging(self):
        """Configure root logging at INFO and bind a module logger."""
        logging.basicConfig(level=logging.INFO)
        self.logger = logging.getLogger(__name__)

    def get_connection(self, host):
        """Open a new connection to *host*; the caller must close it."""
        return mysql.connector.connect(
            host=host,
            user='admin',
            password='password',
            port=3306,
        )

    def check_mgr_status(self):
        """Return a {host: status-dict} map for every node.

        A host is 'ONLINE' when the membership it reports contains at
        least one PRIMARY, 'ERROR' otherwise (including unreachable
        hosts, whose entry carries the exception text).
        """
        status_info = {}
        for host in self.hosts:
            try:
                conn = self.get_connection(host)
                try:
                    cursor = conn.cursor(dictionary=True)
                    # Membership as seen from this node.
                    cursor.execute("""
                        SELECT MEMBER_HOST, MEMBER_PORT, MEMBER_STATE, MEMBER_ROLE
                        FROM performance_schema.replication_group_members
                    """)
                    members = cursor.fetchall()
                    # A healthy group has at least one PRIMARY.
                    cursor.execute("""
                        SELECT COUNT(*) as primary_count
                        FROM performance_schema.replication_group_members
                        WHERE MEMBER_ROLE = 'PRIMARY'
                    """)
                    primary_count = cursor.fetchone()['primary_count']
                    status_info[host] = {
                        'members': members,
                        'primary_count': primary_count,
                        'status': 'ONLINE' if primary_count > 0 else 'ERROR',
                    }
                    cursor.close()
                finally:
                    # FIX: the original leaked the connection whenever a
                    # query raised; always close it.
                    conn.close()
            except Exception as e:
                status_info[host] = {'status': 'ERROR', 'error': str(e)}
        return status_info

    def bootstrap_cluster(self, bootstrap_node):
        """Bootstrap a brand-new group on *bootstrap_node*."""
        try:
            conn = self.get_connection(bootstrap_node)
            try:
                cursor = conn.cursor()
                cursor.execute("SET GLOBAL group_replication_bootstrap_group=ON")
                cursor.execute("START GROUP_REPLICATION")
                # Disable bootstrap again so a restart cannot split the group.
                cursor.execute("SET GLOBAL group_replication_bootstrap_group=OFF")
                self.logger.info(f"MGR cluster bootstrapped on {bootstrap_node}")
                cursor.close()
            finally:
                conn.close()  # FIX: close even when a statement fails
        except Exception as e:
            self.logger.error(f"Bootstrap failed: {e}")

    def join_cluster(self, node):
        """Start group replication on *node*; wait up to 30s for ONLINE."""
        try:
            conn = self.get_connection(node)
            try:
                cursor = conn.cursor()
                cursor.execute("START GROUP_REPLICATION")
                for _ in range(30):
                    cursor.execute("""
                        SELECT MEMBER_STATE FROM performance_schema.replication_group_members
                        WHERE MEMBER_HOST = %s
                    """, (node,))
                    result = cursor.fetchone()
                    if result and result[0] == 'ONLINE':
                        self.logger.info(f"Node {node} joined cluster successfully")
                        break
                    time.sleep(1)
                else:
                    # FIX: the original returned silently on timeout.
                    self.logger.warning(f"Node {node} did not reach ONLINE within 30s")
                cursor.close()
            finally:
                conn.close()  # FIX: close even when a statement fails
        except Exception as e:
            self.logger.error(f"Join cluster failed: {e}")
# Usage example -- connects to live cluster nodes (network I/O).
mgr = MGRManager(['192.168.1.10', '192.168.1.11', '192.168.1.12'])
status = mgr.check_mgr_status()
print(status)
ProxySQL读写分离
ProxySQL配置
-- ProxySQL admin configuration (run against the 6032 admin interface).
-- Backend servers: hostgroup 0 = writer, hostgroup 1 = readers.
INSERT INTO mysql_servers(hostgroup_id, hostname, port, weight) VALUES
(0, '192.168.1.10', 3306, 1000), -- writer group
(1, '192.168.1.11', 3306, 900),  -- reader group
(1, '192.168.1.12', 3306, 900);  -- reader group

-- Application user; queries matching no rule fall back to default_hostgroup 0.
INSERT INTO mysql_users(username, password, default_hostgroup) VALUES
('app_user', 'app_password', 0);

-- Routing rules, evaluated in rule_id order.
-- FIX: the locking-read rule must precede the generic SELECT rule;
-- the original sent SELECT ... FOR UPDATE to a read replica, where the
-- lock is useless and stale reads are possible.
INSERT INTO mysql_query_rules(rule_id, match_pattern, destination_hostgroup, apply) VALUES
(1, '^SELECT.*FOR UPDATE', 0, 1),
(2, '^SELECT.*', 1, 1),
(3, '^INSERT.*', 0, 1),
(4, '^UPDATE.*', 0, 1),
(5, '^DELETE.*', 0, 1);

-- Activate the in-memory configuration, then persist it to disk.
LOAD MYSQL SERVERS TO RUNTIME;
LOAD MYSQL USERS TO RUNTIME;
LOAD MYSQL QUERY RULES TO RUNTIME;
SAVE MYSQL SERVERS TO DISK;
SAVE MYSQL USERS TO DISK;
SAVE MYSQL QUERY RULES TO DISK;
健康检查脚本
#!/bin/bash
# MySQL HA health check: probe each instance, report replication lag
# on replicas, then dump ProxySQL backend state.
MYSQL_HOSTS=("192.168.1.10" "192.168.1.11" "192.168.1.12")
MYSQL_USER="monitor"
MYSQL_PASS="monitor_pass"

# Check one instance: connectivity first, then (for replicas) lag.
check_mysql_status() {
    local host=$1
    if ! mysql -h"${host}" -u"${MYSQL_USER}" -p"${MYSQL_PASS}" \
            -e "SELECT 1" &>/dev/null; then
        echo "ERROR: MySQL ${host} connection failed"
        return 1
    fi
    echo "MySQL ${host}: OK"
    # The first host is the master by convention; only replicas have lag.
    if [ "$host" != "${MYSQL_HOSTS[0]}" ]; then
        local delay
        delay=$(mysql -h"${host}" -u"${MYSQL_USER}" -p"${MYSQL_PASS}" \
            -e "SHOW SLAVE STATUS\G" | grep "Seconds_Behind_Master" | awk '{print $2}')
        # FIX: guard against an empty value (replication not configured);
        # the original evaluated [ "" -gt 60 ] and errored out.
        if [ -n "$delay" ] && [ "$delay" != "NULL" ] && [ "$delay" -gt 60 ]; then
            echo "WARNING: Replication delay on ${host}: ${delay}s"
        fi
    fi
}

# Probe every MySQL instance.
for host in "${MYSQL_HOSTS[@]}"; do
    check_mysql_status "$host"
done

# ProxySQL admin interface: backend server states as seen at runtime.
mysql -h127.0.0.1 -P6032 -uadmin -padmin \
    -e "SELECT hostgroup_id, hostname, status FROM runtime_mysql_servers;" \
    2>/dev/null | grep -E "(ONLINE|OFFLINE)"
故障切换实战
自动故障切换
#!/usr/bin/env python3
import mysql.connector
import time
import subprocess
class FailoverManager:
    """Monitor a MySQL master and promote a replica when it fails.

    config keys: 'master' (host), 'slaves' (ordered candidate list),
    'vip' (virtual IP to move to the promoted node).
    """

    def __init__(self, config):
        self.config = config
        self.master = config['master']
        self.slaves = config['slaves']
        self.vip = config['vip']

    def check_master_health(self):
        """Return True when the master accepts a connection within 5s."""
        try:
            conn = mysql.connector.connect(
                host=self.master,
                user='monitor',
                password='password',
                connection_timeout=5,
            )
            conn.close()
            return True
        except Exception:
            # FIX: the original bare `except:` also swallowed
            # KeyboardInterrupt/SystemExit.
            return False

    def promote_slave(self, slave_host):
        """Promote *slave_host* to master; return True on success."""
        try:
            conn = mysql.connector.connect(
                host=slave_host,
                user='admin',
                password='password',
            )
            try:
                cursor = conn.cursor()
                # Stop applying from the dead master before opening writes.
                cursor.execute("STOP SLAVE")
                cursor.execute("SET GLOBAL read_only = 0")
                cursor.execute("SET GLOBAL super_read_only = 0")
                cursor.close()
            finally:
                # FIX: the original never closed this connection.
                conn.close()
            self.switch_vip(slave_host)
            print(f"Promoted {slave_host} to master")
            return True
        except Exception as e:
            print(f"Failed to promote {slave_host}: {e}")
            return False

    def switch_vip(self, new_master):
        """Move the VIP to the new master.

        NOTE(review): this runs `ip addr add` on the *local* machine,
        not on new_master -- in production this must execute on the new
        master (e.g. via ssh) and the VIP must first be removed from the
        failed node. TODO confirm the intended deployment model.
        """
        cmd = f"ip addr add {self.vip}/24 dev eth0"
        subprocess.run(cmd.split())

    def monitor_and_failover(self):
        """Poll the master every 10s; fail over after 3 consecutive misses."""
        failed_checks = 0
        while True:
            if not self.check_master_health():
                failed_checks += 1
                if failed_checks >= 3:  # avoid failover on a transient blip
                    print(f"Master {self.master} failed, starting failover...")
                    # Promote the first replica that accepts promotion.
                    for slave in self.slaves:
                        if self.promote_slave(slave):
                            break
                    break
            else:
                failed_checks = 0
            time.sleep(10)
# Example configuration
config = {
    'master': '192.168.1.10',
    'slaves': ['192.168.1.11', '192.168.1.12'],
    'vip': '192.168.1.100'
}
# NOTE: monitor_and_failover() blocks forever polling the master.
failover = FailoverManager(config)
failover.monitor_and_failover()
总结与最佳实践
高可用架构选择
- 小型应用:主从复制 + 手动切换
- 中型应用:MHA + 自动切换
- 大型应用:MGR + ProxySQL
- 超大规模:分布式架构
关键要素
- 监控告警:实时监控集群状态
- 自动切换:减少故障恢复时间
- 数据一致性:确保数据不丢失
- 测试验证:定期进行故障演练
高可用架构需要根据业务需求选择合适方案,并建立完善的监控和运维体系。