MySQL优化-监控与分析
大约 8 分钟
MySQL优化-监控与分析
业务场景引入
在生产环境中,MySQL数据库监控至关重要:
- 性能监控:实时掌握数据库响应时间和吞吐量
- 资源监控:CPU、内存、磁盘IO使用情况
- 业务监控:订单处理延迟、用户登录成功率
- 故障预警:提前发现潜在问题,避免业务中断
完善的监控体系是数据库稳定运行的保障。
Performance Schema深度应用
核心监控表详解
-- 1. Statement digest statistics: top query patterns by average latency.
-- Performance Schema timer columns are in PICOSECONDS: 1e12 ps = 1 s
-- (the original /1e9 actually produced milliseconds, not seconds).
SELECT
    schema_name,
    digest_text,
    count_star AS exec_count,
    ROUND(avg_timer_wait / 1000000000000, 3) AS avg_time_sec,
    ROUND(max_timer_wait / 1000000000000, 3) AS max_time_sec,
    ROUND(sum_timer_wait / 1000000000000, 3) AS total_time_sec,
    -- The digest table only stores SUM_ROWS_* totals; derive the
    -- per-execution averages (count_star >= 1 for every digest row).
    ROUND(sum_rows_examined / count_star, 1) AS avg_rows_examined,
    ROUND(sum_rows_sent / count_star, 1) AS avg_rows_sent
FROM performance_schema.events_statements_summary_by_digest
WHERE schema_name = 'ecommerce'
ORDER BY avg_timer_wait DESC
LIMIT 20;
-- 2. Per-table I/O statistics, heaviest tables first.
-- Timer columns are picoseconds: divide by 1e12 for seconds.
SELECT
    object_schema,
    object_name,
    count_read,
    count_write,
    count_fetch,
    count_insert,
    count_update,
    count_delete,
    ROUND(sum_timer_wait / 1000000000000, 3) AS total_time_sec
FROM performance_schema.table_io_waits_summary_by_table
WHERE object_schema = 'ecommerce'
ORDER BY sum_timer_wait DESC;
-- 3. Index usage statistics, most-fetched indexes first.
-- Timer columns are picoseconds: divide by 1e12 for seconds.
-- NOTE: rows where index_name IS NULL represent full-table-scan I/O;
-- they are excluded here by the count_fetch filter only if never read.
SELECT
    object_schema,
    object_name,
    index_name,
    count_fetch,
    count_insert,
    count_update,
    count_delete,
    ROUND(sum_timer_wait / 1000000000000, 3) AS total_time_sec
FROM performance_schema.table_io_waits_summary_by_index_usage
WHERE object_schema = 'ecommerce'
  AND count_fetch > 0
ORDER BY count_fetch DESC;
等待事件分析
-- File I/O wait analysis: which file events cost the most time.
-- Timer columns are picoseconds: divide by 1e12 for seconds.
SELECT
    event_name,
    count_star,
    ROUND(sum_timer_wait / 1000000000000, 3) AS total_time_sec,
    ROUND(avg_timer_wait / 1000000000000, 3) AS avg_time_sec,
    ROUND(max_timer_wait / 1000000000000, 3) AS max_time_sec
FROM performance_schema.events_waits_summary_global_by_event_name
WHERE event_name LIKE 'wait/io/file/%'
ORDER BY sum_timer_wait DESC
LIMIT 15;
-- Mutex/rw-lock (synch) wait analysis, slowest average waits first.
-- Timer columns are picoseconds: divide by 1e12 for seconds.
-- NOTE(review): wait/synch/% instruments are disabled by default; enable
-- them in setup_instruments or these rows will all be zero — confirm setup.
SELECT
    event_name,
    count_star,
    ROUND(sum_timer_wait / 1000000000000, 3) AS total_time_sec,
    ROUND(avg_timer_wait / 1000000000000, 3) AS avg_time_sec
FROM performance_schema.events_waits_summary_global_by_event_name
WHERE event_name LIKE 'wait/synch/%'
  AND count_star > 0
ORDER BY avg_timer_wait DESC
LIMIT 10;
-- Network (socket) I/O wait analysis.
-- Timer columns are picoseconds: divide by 1e12 for seconds.
SELECT
    event_name,
    count_star,
    ROUND(sum_timer_wait / 1000000000000, 3) AS total_time_sec,
    ROUND(avg_timer_wait / 1000000000000, 3) AS avg_time_sec
FROM performance_schema.events_waits_summary_global_by_event_name
WHERE event_name LIKE 'wait/io/socket/%'
ORDER BY sum_timer_wait DESC;
内存使用分析
-- Global memory usage by instrumented event, largest current consumers first.
-- current_* reflects live allocations; high_* is the observed high-water mark.
SELECT
    event_name,
    current_count_used,
    current_size_allocated / (1024 * 1024) AS current_mb,
    high_count_used,
    high_size_allocated / (1024 * 1024) AS high_mb
FROM performance_schema.memory_summary_global_by_event_name
WHERE current_size_allocated > 0
ORDER BY current_size_allocated DESC
LIMIT 20;
-- Per-user memory usage.
-- memory_summary_by_user_by_event_name exposes CURRENT_NUMBER_OF_BYTES_USED /
-- SUM_NUMBER_OF_BYTES_ALLOC — it has no current_allocated / total_allocated /
-- connection-count columns; connection counts come from performance_schema.users.
SELECT
    u.user,
    u.current_connections,
    u.total_connections,
    ROUND(SUM(m.current_number_of_bytes_used) / 1024 / 1024, 2) AS current_allocated_mb,
    ROUND(SUM(m.sum_number_of_bytes_alloc) / 1024 / 1024, 2) AS total_allocated_mb
FROM performance_schema.memory_summary_by_user_by_event_name AS m
INNER JOIN performance_schema.users AS u
    ON u.user = m.user  -- equality join drops the NULL "background" pseudo-user
GROUP BY u.user, u.current_connections, u.total_connections
ORDER BY SUM(m.current_number_of_bytes_used) DESC;
系统级监控
服务器资源监控
-- Statement-time usage over (roughly) the last hour.
-- TIMER_START is picoseconds since SERVER STARTUP, not a Unix epoch, so the
-- original comparison with UNIX_TIMESTAMP()*1e9 matched essentially nothing;
-- anchor the 1-hour window to the newest event in the history table instead.
-- NOTE: this measures total statement latency, which only approximates CPU time.
SELECT
    ROUND(SUM(timer_wait) / 1000000000000, 2) AS cpu_time_seconds,
    COUNT(*) AS statement_count
FROM performance_schema.events_statements_history_long
WHERE timer_start > (
    -- 3600 s * 1e12 ps/s before the most recent recorded event
    SELECT MAX(timer_start) - 3600 * 1000000000000
    FROM performance_schema.events_statements_history_long
);
-- Disk I/O monitoring per file.
-- FILE_NAME lives in file_summary_by_instance; file_summary_by_event_name has
-- no per-file breakdown, so the original query could not run as written.
-- Timer columns are picoseconds: divide by 1e12 for seconds.
SELECT
    file_name,
    event_name,
    ROUND(SUM(sum_timer_wait) / 1000000000000, 3) AS total_io_time_sec,
    SUM(count_star) AS total_io_ops,
    ROUND(AVG(avg_timer_wait) / 1000000000000, 6) AS avg_io_time_sec
FROM performance_schema.file_summary_by_instance
WHERE event_name IN ('wait/io/file/sql/binlog', 'wait/io/file/innodb/innodb_data_file')
GROUP BY file_name, event_name
ORDER BY total_io_time_sec DESC;
-- Network traffic and connection counters.
-- Only the byte counters are scaled to MB; the connection counters pass
-- through unchanged (the value_mb alias applies to the first two rows only).
SELECT
    VARIABLE_NAME,
    VARIABLE_VALUE,
    CASE
        WHEN VARIABLE_NAME IN ('Bytes_received', 'Bytes_sent')
            THEN ROUND(VARIABLE_VALUE / 1024 / 1024, 2)
        ELSE VARIABLE_VALUE
    END AS value_mb
FROM performance_schema.global_status
WHERE VARIABLE_NAME IN ('Bytes_received', 'Bytes_sent', 'Connections', 'Aborted_connects');
InnoDB监控指标
-- InnoDB buffer pool monitoring.
SELECT
    'Buffer Pool Size' AS metric,
    ROUND(@@innodb_buffer_pool_size / 1024 / 1024 / 1024, 2) AS value_gb
UNION ALL
SELECT
    'Buffer Pool Hit Ratio',
    -- NULLIF makes the "no read requests yet" case an explicit NULL instead
    -- of relying on MySQL's division-by-zero warning right after startup.
    ROUND((1 - (
        SELECT VARIABLE_VALUE FROM performance_schema.global_status
        WHERE VARIABLE_NAME = 'Innodb_buffer_pool_reads'
    ) / NULLIF((
        SELECT VARIABLE_VALUE FROM performance_schema.global_status
        WHERE VARIABLE_NAME = 'Innodb_buffer_pool_read_requests'
    ), 0)) * 100, 2)
UNION ALL
SELECT
    'Buffer Pool Pages Free',
    (SELECT VARIABLE_VALUE FROM performance_schema.global_status
     WHERE VARIABLE_NAME = 'Innodb_buffer_pool_pages_free')
UNION ALL
SELECT
    'Buffer Pool Pages Dirty',
    (SELECT VARIABLE_VALUE FROM performance_schema.global_status
     WHERE VARIABLE_NAME = 'Innodb_buffer_pool_pages_dirty');
-- InnoDB lock monitoring.
SELECT
    'Lock Waits' AS metric,
    VARIABLE_VALUE AS current_value
FROM performance_schema.global_status
WHERE VARIABLE_NAME = 'Innodb_row_lock_waits'
UNION ALL
SELECT
    'Lock Wait Time (seconds)',
    -- Innodb_row_lock_time is reported in milliseconds, hence /1000.
    ROUND(VARIABLE_VALUE / 1000, 2)
FROM performance_schema.global_status
WHERE VARIABLE_NAME = 'Innodb_row_lock_time'
UNION ALL
-- NOTE(review): 'Innodb_deadlocks' is a MariaDB status variable; stock MySQL
-- does not expose it, so this branch returns zero rows there — confirm the
-- target engine before relying on it.
SELECT
    'Deadlocks',
    VARIABLE_VALUE
FROM performance_schema.global_status
WHERE VARIABLE_NAME = 'Innodb_deadlocks';
-- InnoDB redo-log monitoring.
-- A growing 'Log Waits' counter suggests innodb_log_buffer_size is too small.
SELECT
    'Log Waits' AS metric,
    VARIABLE_VALUE
FROM performance_schema.global_status
WHERE VARIABLE_NAME = 'Innodb_log_waits'
UNION ALL
SELECT
    'Log Writes',
    VARIABLE_VALUE
FROM performance_schema.global_status
WHERE VARIABLE_NAME = 'Innodb_log_writes'
UNION ALL
SELECT
    'Log Write Requests',
    VARIABLE_VALUE
FROM performance_schema.global_status
WHERE VARIABLE_NAME = 'Innodb_log_write_requests';
业务监控指标
关键业务指标监控
-- Business-metric snapshot view for "today".
-- Half-open ranges (>= today, < tomorrow) keep the predicates sargable so an
-- index on order_date / created_at can be used; DATE(col) = CURDATE() wraps
-- the column in a function and forces a full scan.
CREATE VIEW business_metrics AS
SELECT
    'Daily Orders' AS metric,
    COUNT(*) AS value,
    CURDATE() AS metric_date
FROM orders
WHERE order_date >= CURDATE()
  AND order_date < CURDATE() + INTERVAL 1 DAY
UNION ALL
SELECT
    'Daily Revenue',
    -- SUM over zero rows is NULL; report 0 for a day with no paid orders.
    COALESCE(ROUND(SUM(total_amount), 2), 0),
    CURDATE()
FROM orders
WHERE order_date >= CURDATE()
  AND order_date < CURDATE() + INTERVAL 1 DAY
  AND order_status IN ('PAID', 'SHIPPED', 'DELIVERED')
UNION ALL
SELECT
    'Active Users Today',
    COUNT(DISTINCT user_id),
    CURDATE()
FROM user_activities
WHERE created_at >= CURDATE()
  AND created_at < CURDATE() + INTERVAL 1 DAY
UNION ALL
SELECT
    'Average Order Value',
    ROUND(AVG(total_amount), 2),
    CURDATE()
FROM orders
WHERE order_date >= CURDATE()
  AND order_date < CURDATE() + INTERVAL 1 DAY
UNION ALL
SELECT
    'Payment Success Rate (%)',
    ROUND(
        SUM(CASE WHEN order_status != 'PAYMENT_FAILED' THEN 1 ELSE 0 END) * 100.0
            / NULLIF(COUNT(*), 0),  -- explicit NULL on days with no orders
        2
    ),
    CURDATE()
FROM orders
WHERE order_date >= CURDATE()
  AND order_date < CURDATE() + INTERVAL 1 DAY;
-- Inspect the business metrics.
SELECT * FROM business_metrics;
异常检测监控
-- Anomalous-order monitoring for today.
-- Half-open date ranges keep the predicates sargable (index on order_date).
-- NOTE(review): GROUP_CONCAT output is capped by group_concat_max_len
-- (default 1024 bytes); raise it if the ID lists get truncated.
SELECT
    'Large Orders (>10000)' AS alert_type,
    COUNT(*) AS count,
    GROUP_CONCAT(order_id) AS order_ids
FROM orders
WHERE order_date >= CURDATE()
  AND order_date < CURDATE() + INTERVAL 1 DAY
  AND total_amount > 10000
UNION ALL
SELECT
    'Failed Payments Today',
    COUNT(*),
    GROUP_CONCAT(order_id)
FROM orders
WHERE order_date >= CURDATE()
  AND order_date < CURDATE() + INTERVAL 1 DAY
  AND order_status = 'PAYMENT_FAILED'
UNION ALL
SELECT
    'Cancelled Orders (>5%)',
    COUNT(*),
    -- In this branch the third column doubles as an alert flag rather
    -- than an ID list.
    CASE
        WHEN COUNT(*) > (
            SELECT COUNT(*) * 0.05
            FROM orders
            WHERE order_date >= CURDATE()
              AND order_date < CURDATE() + INTERVAL 1 DAY
        ) THEN 'ALERT'
        ELSE 'OK'
    END
FROM orders
WHERE order_date >= CURDATE()
  AND order_date < CURDATE() + INTERVAL 1 DAY
  AND order_status = 'CANCELLED';
-- Low-stock alert: active products with 10 or fewer units remaining.
SELECT
    'Low Stock Products' AS alert_type,
    COUNT(*) AS count,
    GROUP_CONCAT(CONCAT(product_name, '(', stock_quantity, ')') SEPARATOR ', ') AS details
FROM products
WHERE status = 'ACTIVE'
  AND stock_quantity <= 10;
自动化监控系统
Python监控脚本
import mysql.connector
import time
import json
import smtplib
from email.mime.text import MIMEText
from datetime import datetime, timedelta
import logging
class MySQLMonitor:
    """Periodic MySQL health monitor.

    Runs three groups of checks (performance, business, system health)
    against the configured database and produces alert dicts of the form
    ``{'type': ..., 'severity': 'INFO'|'WARNING'|'CRITICAL', 'message': ...}``.
    """

    # Performance Schema timer columns are in PICOSECONDS: 5 s == 5e12 ps.
    # (The original literal 5000000000 was 5 ms, not the intended 5 s.)
    SLOW_QUERY_THRESHOLD_PS = 5_000_000_000_000

    def __init__(self, db_config, alert_config):
        self.db_config = db_config          # kwargs for mysql.connector.connect()
        self.alert_config = alert_config    # SMTP settings used by send_alert
        self.setup_logging()

    def setup_logging(self):
        """Configure logging to both a file and the console."""
        logging.basicConfig(
            level=logging.INFO,
            format='%(asctime)s - %(levelname)s - %(message)s',
            handlers=[
                logging.FileHandler('/var/log/mysql_monitor.log'),
                logging.StreamHandler()
            ]
        )
        self.logger = logging.getLogger(__name__)

    def get_connection(self):
        """Open a fresh database connection; the caller must close it."""
        return mysql.connector.connect(**self.db_config)

    def check_performance_metrics(self):
        """Check server-level performance indicators.

        Returns a list of alert dicts (empty when everything is healthy).
        """
        alerts = []
        conn = self.get_connection()
        try:  # ensure the connection is released even if a query fails
            cursor = conn.cursor(dictionary=True)

            # Buffer pool hit ratio. NULLIF guards against a freshly started
            # server with zero read requests (ratio comes back as NULL/None).
            cursor.execute("""
                SELECT
                    ROUND((1 - (
                        SELECT VARIABLE_VALUE FROM performance_schema.global_status
                        WHERE VARIABLE_NAME = 'Innodb_buffer_pool_reads'
                    ) / NULLIF((
                        SELECT VARIABLE_VALUE FROM performance_schema.global_status
                        WHERE VARIABLE_NAME = 'Innodb_buffer_pool_read_requests'
                    ), 0)) * 100, 2) as hit_ratio
            """)
            hit_ratio = cursor.fetchone()['hit_ratio']
            if hit_ratio is not None and hit_ratio < 95:
                alerts.append({
                    'type': 'PERFORMANCE',
                    'severity': 'WARNING',
                    'message': f'InnoDB Buffer Pool hit ratio is low: {hit_ratio}%'
                })

            # Connection usage as a percentage of max_connections.
            cursor.execute("SHOW STATUS LIKE 'Threads_connected'")
            current_connections = int(cursor.fetchone()['Value'])
            cursor.execute("SHOW VARIABLES LIKE 'max_connections'")
            max_connections = int(cursor.fetchone()['Value'])
            connection_usage = (current_connections / max_connections) * 100
            if connection_usage > 80:
                alerts.append({
                    'type': 'CONNECTIONS',
                    'severity': 'CRITICAL',
                    'message': f'High connection usage: {connection_usage:.1f}% '
                               f'({current_connections}/{max_connections})'
                })

            # Slow query patterns: digests averaging more than 5 seconds.
            cursor.execute("""
                SELECT COUNT(*) as slow_queries
                FROM performance_schema.events_statements_summary_by_digest
                WHERE avg_timer_wait > %s
            """, (self.SLOW_QUERY_THRESHOLD_PS,))
            slow_queries = cursor.fetchone()['slow_queries']
            if slow_queries > 10:
                alerts.append({
                    'type': 'SLOW_QUERIES',
                    'severity': 'WARNING',
                    'message': f'Found {slow_queries} slow query patterns'
                })
            cursor.close()
        finally:
            conn.close()
        return alerts

    def check_business_metrics(self):
        """Check order volume and payment failure rate for today.

        Returns a list of alert dicts.
        """
        alerts = []
        conn = self.get_connection()
        try:
            cursor = conn.cursor(dictionary=True)

            # Today's order count. Half-open date ranges keep the predicate
            # sargable so an index on order_date can be used.
            cursor.execute("""
                SELECT COUNT(*) as today_orders
                FROM orders
                WHERE order_date >= CURDATE()
                  AND order_date < CURDATE() + INTERVAL 1 DAY
            """)
            today_orders = cursor.fetchone()['today_orders']

            # Yesterday's order count as the comparison baseline.
            cursor.execute("""
                SELECT COUNT(*) as yesterday_orders
                FROM orders
                WHERE order_date >= CURDATE() - INTERVAL 1 DAY
                  AND order_date < CURDATE()
            """)
            yesterday_orders = cursor.fetchone()['yesterday_orders']

            if yesterday_orders > 0:
                order_change = ((today_orders - yesterday_orders) / yesterday_orders) * 100
                if order_change < -20:  # order volume dropped by more than 20%
                    alerts.append({
                        'type': 'BUSINESS',
                        'severity': 'WARNING',
                        'message': f'Order count dropped by {abs(order_change):.1f}%: '
                                   f'{today_orders} vs {yesterday_orders}'
                    })

            # Payment failure rate for today.
            cursor.execute("""
                SELECT
                    COUNT(*) as total_orders,
                    SUM(CASE WHEN order_status = 'PAYMENT_FAILED' THEN 1 ELSE 0 END) as failed_payments
                FROM orders
                WHERE order_date >= CURDATE()
                  AND order_date < CURDATE() + INTERVAL 1 DAY
            """)
            result = cursor.fetchone()
            if result['total_orders'] > 0:
                failure_rate = (result['failed_payments'] / result['total_orders']) * 100
                if failure_rate > 5:  # payment failure rate above 5%
                    alerts.append({
                        'type': 'BUSINESS',
                        'severity': 'CRITICAL',
                        'message': f'High payment failure rate: {failure_rate:.1f}%'
                    })
            cursor.close()
        finally:
            conn.close()
        return alerts

    def check_system_health(self):
        """Check storage size and deadlock counters.

        Returns a list of alert dicts.
        """
        alerts = []
        conn = self.get_connection()
        try:
            cursor = conn.cursor(dictionary=True)

            # Total on-disk size of the monitored schema.
            cursor.execute("""
                SELECT
                    table_schema,
                    ROUND(SUM(data_length + index_length) / 1024 / 1024 / 1024, 2) as size_gb
                FROM information_schema.tables
                WHERE table_schema = 'ecommerce'
                GROUP BY table_schema
            """)
            result = cursor.fetchone()
            if result and result['size_gb'] > 100:  # database larger than 100 GB
                alerts.append({
                    'type': 'STORAGE',
                    'severity': 'INFO',
                    'message': f'Database size: {result["size_gb"]}GB'
                })

            # 'Innodb_deadlocks' exists on MariaDB but not on stock MySQL, so
            # the SHOW STATUS result set may be empty — guard against None
            # (the original int(cursor.fetchone()['Value']) crashed there).
            cursor.execute("SHOW STATUS LIKE 'Innodb_deadlocks'")
            row = cursor.fetchone()
            deadlocks = int(row['Value']) if row else 0
            # TODO(review): this is a cumulative counter since startup; it
            # should be compared against the previous sample, not against 0.
            if deadlocks > 0:
                alerts.append({
                    'type': 'DEADLOCKS',
                    'severity': 'WARNING',
                    'message': f'Deadlocks detected: {deadlocks}'
                })
            cursor.close()
        finally:
            conn.close()
        return alerts

    def send_alert(self, alerts):
        """Compose an alert email body and log the summary.

        An email is only composed when there is at least one CRITICAL alert
        or more than three WARNING alerts; actual SMTP delivery is left
        disabled until the server in alert_config is configured.
        """
        if not alerts:
            return
        critical_alerts = [a for a in alerts if a['severity'] == 'CRITICAL']
        warning_alerts = [a for a in alerts if a['severity'] == 'WARNING']
        if critical_alerts or len(warning_alerts) > 3:
            subject = f"MySQL Alert - {len(critical_alerts)} Critical, {len(warning_alerts)} Warning"
            body = "MySQL Monitoring Alert Report\n"
            body += f"Time: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}\n\n"
            if critical_alerts:
                body += "CRITICAL ALERTS:\n"
                for alert in critical_alerts:
                    body += f"- {alert['type']}: {alert['message']}\n"
                body += "\n"
            if warning_alerts:
                body += "WARNING ALERTS:\n"
                for alert in warning_alerts:
                    body += f"- {alert['type']}: {alert['message']}\n"
            # Delivery requires an SMTP server from self.alert_config:
            # self._send_email(subject, body)
            self.logger.warning(f"Alerts generated: {subject}")

    def run_monitoring_cycle(self):
        """Run one full monitoring pass, alerting and logging the results."""
        try:
            self.logger.info("Starting monitoring cycle")
            all_alerts = []
            all_alerts.extend(self.check_performance_metrics())
            all_alerts.extend(self.check_business_metrics())
            all_alerts.extend(self.check_system_health())
            if all_alerts:
                self.send_alert(all_alerts)
                for alert in all_alerts:
                    self.logger.warning(f"{alert['severity']}: {alert['message']}")
            else:
                self.logger.info("All checks passed")
        except Exception as e:
            # Never let one failed cycle kill a long-running monitor loop.
            self.logger.error(f"Monitoring cycle failed: {e}")
# Usage example.
if __name__ == "__main__":
    # NOTE(review): credentials are hard-coded for illustration only;
    # load them from environment variables or a secret store in production.
    db_config = {
        'host': 'localhost',
        'user': 'monitor_user',
        'password': 'password',
        'database': 'ecommerce'
    }
    alert_config = {
        'smtp_server': 'smtp.company.com',
        'smtp_port': 587,
        'email_user': 'monitor@company.com',
        'email_password': 'password',
        'recipients': ['admin@company.com', 'dba@company.com']
    }

    mysql_monitor = MySQLMonitor(db_config, alert_config)

    # Run a single check pass.
    mysql_monitor.run_monitoring_cycle()

    # Or monitor continuously, one pass every 5 minutes:
    # while True:
    #     mysql_monitor.run_monitoring_cycle()
    #     time.sleep(300)
监控仪表板
-- Monitoring dashboard view.
-- Each ratio is computed ONCE in a derived table instead of being repeated
-- in every CASE branch (the original duplicated the buffer-pool subquery
-- four times and the connection subquery three times).
-- Requires MySQL 5.7.7+ (derived tables in view definitions).
CREATE VIEW monitoring_dashboard AS
-- Buffer pool hit ratio; NULLIF avoids division by zero after startup.
SELECT
    'System Performance' AS category,
    'Buffer Pool Hit Ratio (%)' AS metric,
    bp.hit_ratio AS value,
    CASE
        WHEN bp.hit_ratio >= 95 THEN 'GOOD'
        WHEN bp.hit_ratio >= 90 THEN 'WARNING'
        ELSE 'CRITICAL'
    END AS status
FROM (
    SELECT ROUND((1 - (
        SELECT VARIABLE_VALUE FROM performance_schema.global_status
        WHERE VARIABLE_NAME = 'Innodb_buffer_pool_reads'
    ) / NULLIF((
        SELECT VARIABLE_VALUE FROM performance_schema.global_status
        WHERE VARIABLE_NAME = 'Innodb_buffer_pool_read_requests'
    ), 0)) * 100, 2) AS hit_ratio
) AS bp
UNION ALL
-- Connection usage as a percentage of max_connections.
SELECT
    'System Performance',
    'Connection Usage (%)',
    cu.usage_pct,
    CASE
        WHEN cu.usage_pct < 70 THEN 'GOOD'
        WHEN cu.usage_pct < 85 THEN 'WARNING'
        ELSE 'CRITICAL'
    END
FROM (
    SELECT ROUND((
        SELECT VARIABLE_VALUE FROM performance_schema.global_status
        WHERE VARIABLE_NAME = 'Threads_connected'
    ) * 100.0 / (
        SELECT VARIABLE_VALUE FROM performance_schema.global_variables
        WHERE VARIABLE_NAME = 'max_connections'
    ), 2) AS usage_pct
) AS cu
UNION ALL
-- Business metrics: half-open date ranges keep the predicates sargable.
SELECT
    'Business Metrics',
    'Orders Today',
    COUNT(*),
    'INFO'
FROM orders
WHERE order_date >= CURDATE()
  AND order_date < CURDATE() + INTERVAL 1 DAY
UNION ALL
SELECT
    'Business Metrics',
    'Revenue Today',
    COALESCE(ROUND(SUM(total_amount), 2), 0),  -- 0 instead of NULL on empty days
    'INFO'
FROM orders
WHERE order_date >= CURDATE()
  AND order_date < CURDATE() + INTERVAL 1 DAY
  AND order_status IN ('PAID', 'SHIPPED', 'DELIVERED');
-- Inspect the dashboard.
SELECT * FROM monitoring_dashboard ORDER BY category, metric;
Grafana集成脚本
import requests
import json
from datetime import datetime
class GrafanaIntegration:
    """Pushes monitoring metrics to Grafana as annotations via its HTTP API."""

    def __init__(self, grafana_url, api_key):
        self.grafana_url = grafana_url
        self.headers = {
            'Authorization': f'Bearer {api_key}',
            'Content-Type': 'application/json'
        }

    def send_metrics(self, metrics, timeout=10):
        """Send each metric dict to Grafana's annotations endpoint.

        Each metric should provide 'value' and may provide 'panel_id',
        'text' and 'tags'. `timeout` (seconds) bounds every request so an
        unresponsive Grafana cannot hang the monitoring loop — the original
        call had no timeout and could block forever.
        """
        # Grafana annotation timestamps are epoch milliseconds.
        timestamp = int(datetime.now().timestamp() * 1000)
        for metric in metrics:
            payload = {
                'dashboard': 'mysql-monitoring',
                'panelId': metric.get('panel_id', 1),
                'time': timestamp,
                'value': metric['value'],
                'text': metric.get('text', ''),
                'tags': metric.get('tags', [])
            }
            try:
                # json= serializes the payload and is the idiomatic requests
                # way to send a JSON body (replaces data=json.dumps(...)).
                response = requests.post(
                    f"{self.grafana_url}/api/annotations",
                    headers=self.headers,
                    json=payload,
                    timeout=timeout,
                )
                response.raise_for_status()
            except Exception as e:
                # Best-effort: a failed push must not abort the whole batch.
                print(f"Failed to send metric: {e}")
# Usage example.
grafana_client = GrafanaIntegration('http://grafana:3000', 'your_api_key')
sample_metrics = [
    {
        'panel_id': 1,
        'value': 95.5,
        'text': 'Buffer pool hit ratio',
        'tags': ['mysql', 'performance'],
    },
]
grafana_client.send_metrics(sample_metrics)
告警规则配置
告警阈值设定
-- Alert rule configuration table.
-- IF NOT EXISTS keeps the setup script idempotent; the named UNIQUE
-- constraint prevents the same rule from being registered twice.
CREATE TABLE IF NOT EXISTS alert_rules (
    rule_id INT PRIMARY KEY AUTO_INCREMENT,
    rule_name VARCHAR(100) NOT NULL,
    metric_name VARCHAR(100) NOT NULL,
    threshold_value DECIMAL(10,2) NOT NULL,
    comparison_operator ENUM('>', '<', '>=', '<=', '=', '!=') NOT NULL,
    severity ENUM('INFO', 'WARNING', 'CRITICAL') NOT NULL,
    is_enabled BOOLEAN DEFAULT TRUE,
    created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
    CONSTRAINT alert_rules_rule_name_uk UNIQUE (rule_name)
);
-- Seed the default alert rules.
INSERT INTO alert_rules (rule_name, metric_name, threshold_value, comparison_operator, severity) VALUES
('Buffer Pool Hit Ratio Low', 'buffer_pool_hit_ratio', 95.00, '<', 'WARNING'),
('High Connection Usage', 'connection_usage_percent', 80.00, '>', 'CRITICAL'),
('Slow Query Count High', 'slow_query_count', 10, '>', 'WARNING'),
('Payment Failure Rate High', 'payment_failure_rate', 5.00, '>', 'CRITICAL'),
('Low Stock Alert', 'low_stock_products', 0, '>', 'WARNING');
-- Review the enabled rules.
SELECT * FROM alert_rules WHERE is_enabled = TRUE;
MySQL监控与分析是确保数据库稳定运行的关键环节,通过完善的监控体系,可以及时发现和解决潜在问题,保障业务的连续性。