Resilience4j 数据库熔断-健康检查sql

发布于:2024-08-02 ⋅ 阅读:(37) ⋅ 点赞:(0)

最近项目上开始使用Resilience4j进行数据库熔断

需求是根据数据库用户的情况进行单用户的熔断。这里给出一个 SQL,供 Resilience4j 做健康检查,并结合异常信息判断单库的情况。

-- Per-database health check for PostgreSQL, intended to be polled by a
-- Resilience4j circuit breaker. Returns one row per UNHEALTHY database;
-- an empty result set means every checked database passed.
-- The threshold values below are a template: tune them for the real workload.

-- Databases to check: every non-template database except the 'repmgr' and
-- 'postgres' maintenance databases.
WITH database_list AS (
    SELECT datname
    FROM pg_database
    WHERE datistemplate = false
      AND datname NOT IN ('repmgr', 'postgres')
),
-- Session metrics, aggregated in a single pass over pg_stat_activity.
-- The check runs on both primary and standby servers, so metrics that can only
-- be collected on the primary are intentionally left out.
-- Note: the session running this check is itself counted for the database it
-- is connected to.
session_stats AS (
    SELECT
        a.datname,
        count(*) AS all_connections,                                        -- total connections
        count(*) FILTER (WHERE a.state = 'active') AS active_connections,   -- connections running a query
        -- Longest currently running query; replication (repmgr) and CDC
        -- (Debezium) sessions are excluded from the duration metric.
        max(now() - a.query_start) FILTER (
            WHERE a.state = 'active'
              AND a.usename <> 'repmgr'
              AND a.application_name <> 'Debezium Streaming'
        ) AS max_query_duration
    FROM pg_stat_activity AS a
    WHERE a.datname IS NOT NULL
    GROUP BY a.datname
),
-- Number of lock requests that have not been granted yet, per database of the
-- waiting session.
lock_stats AS (
    SELECT
        a.datname,
        count(*) AS waiting_locks
    FROM pg_locks AS l
    INNER JOIN pg_stat_activity AS a
        ON l.pid = a.pid
    WHERE NOT l.granted
      AND a.datname IS NOT NULL
    GROUP BY a.datname
),
-- One row per candidate database. LEFT JOINs keep databases that currently
-- have no sessions or no waiting locks (counts default to 0, duration to NULL).
health_checks AS (
    SELECT
        d.datname,
        1 AS survive,                                         -- liveness marker: the query ran at all
        COALESCE(s.all_connections, 0) AS all_connections,
        COALESCE(s.active_connections, 0) AS active_connections,
        COALESCE(k.waiting_locks, 0) AS waiting_locks,
        s.max_query_duration
    FROM database_list AS d
    LEFT JOIN session_stats AS s
        ON s.datname = d.datname
    LEFT JOIN lock_stats AS k
        ON k.datname = d.datname
),
-- Apply the thresholds exactly once so the status label and the row filter
-- can never drift apart.
evaluated AS (
    SELECT
        datname,
        survive,
        active_connections,
        all_connections,
        waiting_locks,
        max_query_duration,
        CASE
            WHEN survive <> 1 THEN 'Unhealthy'
            WHEN all_connections > 200 THEN 'Unhealthy'
            WHEN active_connections > 20 THEN 'Unhealthy'
            WHEN waiting_locks > 10 THEN 'Unhealthy'
            WHEN max_query_duration > interval '60 seconds' THEN 'Unhealthy'
            ELSE 'Healthy'
        END AS health_status
    FROM health_checks
)
-- Report only the databases that violate at least one threshold.
SELECT
    datname AS database_name,
    survive,
    active_connections,
    all_connections,
    waiting_locks,
    max_query_duration,
    health_status
FROM evaluated
WHERE health_status = 'Unhealthy';