最近项目上开始使用 Resilience4j 进行数据库熔断。
需求是根据各数据库用户的情况进行单用户级别的熔断，这里给出一段 SQL，供 Resilience4j 做健康检查时使用，结合异常信息可以方便地判断单个库的健康状况。
-- Per-database health probe for PostgreSQL (9.4+ because of the aggregate
-- FILTER clause), written to be polled by a circuit breaker health check.
--
-- Result: one row per database that breaches at least one threshold below.
-- An empty result set means every checked database is considered healthy.
--
-- Per the original author's note, the metrics are limited to ones that can be
-- collected on both primary and standby servers, because the probe is run on
-- both; primary-only metrics were deliberately left out.
--
-- The thresholds are a template only: tune the numbers and the rules in the
-- `evaluated` CTE to the actual workload.
WITH database_list AS (
    -- Every non-template database except the 'repmgr' and 'postgres' databases.
    SELECT datname
    FROM pg_database
    WHERE NOT datistemplate
      AND datname NOT IN ('repmgr', 'postgres')
),
activity_stats AS (
    -- Single pass over pg_stat_activity, aggregated per database, instead of
    -- one correlated subquery per metric per database.
    SELECT
        a.datname,
        count(*) AS all_connections,                                        -- sessions attached to the database
        count(*) FILTER (WHERE a.state = 'active') AS active_connections,   -- sessions currently running a query
        -- Longest-running active query. Sessions of user 'repmgr' and sessions
        -- whose application_name is 'Debezium Streaming' are ignored here.
        max(now() - a.query_start) FILTER (
            WHERE a.state = 'active'
              AND a.usename <> 'repmgr'
              AND a.application_name <> 'Debezium Streaming'
        ) AS max_query_duration
    FROM pg_stat_activity AS a
    WHERE a.datname IS NOT NULL
    GROUP BY a.datname
),
lock_stats AS (
    -- Lock requests that have not been granted yet, attributed to the database
    -- of the waiting session (matched through the backend pid).
    SELECT
        a.datname,
        count(*) AS waiting_locks
    FROM pg_locks AS l
    INNER JOIN pg_stat_activity AS a
        ON a.pid = l.pid
    WHERE NOT l.granted
      AND a.datname IS NOT NULL
    GROUP BY a.datname
),
health_checks AS (
    -- One row per checked database. Databases without any session or waiting
    -- lock get 0 for the counters and NULL for max_query_duration.
    SELECT
        d.datname,
        1 AS survive,                                       -- liveness marker: the query itself ran
        COALESCE(s.all_connections, 0) AS all_connections,
        COALESCE(s.active_connections, 0) AS active_connections,
        COALESCE(k.waiting_locks, 0) AS waiting_locks,
        s.max_query_duration
    FROM database_list AS d
    LEFT JOIN activity_stats AS s
        ON s.datname = d.datname
    LEFT JOIN lock_stats AS k
        ON k.datname = d.datname
),
evaluated AS (
    -- The threshold rules live only here, so the verdict and the row filter
    -- in the final SELECT cannot drift apart.
    SELECT
        datname AS database_name,
        survive,
        active_connections,
        all_connections,
        waiting_locks,
        max_query_duration,
        CASE
            WHEN survive <> 1 THEN 'Unhealthy'
            WHEN all_connections > 200 THEN 'Unhealthy'
            WHEN active_connections > 20 THEN 'Unhealthy'
            WHEN waiting_locks > 10 THEN 'Unhealthy'
            -- A NULL duration (no qualifying active query) does not match.
            WHEN max_query_duration > interval '60 seconds' THEN 'Unhealthy'
            ELSE 'Healthy'
        END AS health_status
    FROM health_checks
)
-- Report only the databases that failed at least one rule.
SELECT
    database_name,
    survive,
    active_connections,
    all_connections,
    waiting_locks,
    max_query_duration,
    health_status
FROM evaluated
WHERE health_status = 'Unhealthy'
ORDER BY database_name;