均采用docker-compose部署Prometheus基础环境
部署步骤可参考 https://blog.csdn.net/ht9999i/article/details/134733793?spm=1001.2014.3001.5501 一文的开头部分。
一.部署Prometheus
# cat docker-compose.yml
# Docker Compose definition for a single Prometheus server container.
version: "3"
services:
  prometheus:
    # Older image tag, kept for reference:
    # image: prom/prometheus:v2.0.0
    image: prom/prometheus:v2.18.1
    container_name: prometheus
    # --storage.tsdb.retention.time replaces the deprecated
    # --storage.tsdb.retention flag; samples are kept for 30 days.
    command:
      - --config.file=/etc/prometheus/prometheus.yml
      - --storage.tsdb.path=/prometheus
      - --storage.tsdb.retention.time=30d
    # Run as uid 0 / gid 0 (root); presumably so the process can write to the
    # bind-mounted data directory -- confirm before tightening.
    user: "0:0"
    ports:
      - "9090:9090"
    volumes:
      # Main configuration file.
      - /data/prometheus/prometheus.yml:/etc/prometheus/prometheus.yml
      # Alert rule files, exposed in the container as /etc/prometheus/rules.d/.
      - /data/prometheus/alerts/:/etc/prometheus/rules.d/
      # TSDB storage (matches --storage.tsdb.path).
      - /data/prometheus/data/:/prometheus
      # Share the host's local time settings, read-only.
      - /etc/localtime:/etc/localtime:ro
准备数据目录和配置文件(/data/prometheus 下的 prometheus.yml、alerts/、data/)
# cat /data/prometheus/prometheus.yml
# Prometheus server configuration: global defaults, Alertmanager endpoint,
# rule files and scrape jobs.
global:
  # Scrape targets every 120 seconds unless a job overrides it.
  scrape_interval: 120s
  # Evaluate alerting/recording rules every 120 seconds.
  evaluation_interval: 120s
  # Give up on a scrape after 50 seconds.
  scrape_timeout: 50s
  # Attach these labels to any time series or alerts when communicating with
  # external systems (federation, remote storage, Alertmanager).
  external_labels:
    monitor: 'tencent-prom-stack'

# Alertmanager configuration
alerting:
  alertmanagers:
    - static_configs:
        - targets:
            - 10.6.219.71:9093

# Load rules once and periodically evaluate them according to the global
# 'evaluation_interval'. The path is relative to this configuration file.
rule_files:
  - 'rules.d/*'
  # - "second_rules.yml"

scrape_configs:
  # Prometheus scraping itself.
  - job_name: 'prom-stack'
    static_configs:
      - targets:
          - prometheus:9090

  # Linux hosts (port 9100 on each target)
  - job_name: 'idc-node-linux'
    # Overrides the global 120s interval for this job.
    scrape_interval: 60s
    metrics_path: /metrics  # URL path the metrics are fetched from
    static_configs:
      - targets:
          - "10.x.x.x:9100"
          - "10.x.x.x:9100"
          - "10.x.x.x:9100"
          - "10.x.x.x:9100"

  # Load balancer (network device), polled through snmp_exporter
  - job_name: LB-NETWORK
    static_configs:
      - targets:
          - 10.x.x.x  # address of the network device
    metrics_path: /snmp
    params:
      module:
        - if_mib  # switch to another module for other kinds of devices
      # community:
      #   - xxxxxx  # community to use; takes effect when the snmp_exporter
      #             # snmp.yml does not specify one
    relabel_configs:
      # Pass the device address to the exporter as the ?target= URL parameter.
      - source_labels: [__address__]
        target_label: __param_target
      # Keep the device address as the instance label.
      - source_labels: [__param_target]
        target_label: instance
      # Send the actual scrape request to the snmp_exporter.
      - target_label: __address__
        replacement: x.x.x.x:9116  # address:port of the locally deployed snmp_exporter
告警规则文件
# cat /data/prometheus/alerts/node-rule.yml
# Alerting rules for host availability.
# To whitelist a host (suppress its alerts), add a matcher such as
# instance!="10.x.xx.xx:9100" to the `up` selector of the expression,
# e.g. up{instance!="10.x.xx.xx:9100"}.
groups:
  - name: node
    rules:
      # Alert for any node_exporter instance that is unreachable for >5 minutes,
      # enriched with the host's nodename.
      # While a target is down its node_uname_info series is stale, so joining
      # against the instant value would never match; the join therefore uses
      # the value seen within the last day to recover the nodename label.
      - alert: NodeDown
        expr: >-
          up * on(instance) group_left(nodename)
          max by (instance, nodename) (max_over_time(node_uname_info[1d]))
          == 0
        for: 5m
        labels:
          # NOTE: this label was previously misspelled "serverity"; any
          # Alertmanager route matching on it must use the same spelling.
          severity: deadly
        annotations:
          summary: "Instance {{ $labels.instance }} down"
          description: "{{ $labels.instance }} of job {{ $labels.job }} has been down for more than 5 minutes."

      # Alert for any scrape target, of any job, that is down for >5 minutes.
      - alert: InstanceDown
        expr: up == 0
        for: 5m
        labels:
          # NOTE: this label was previously misspelled "serverity"; any
          # Alertmanager route matching on it must use the same spelling.
          severity: deadly
        annotations:
          summary: "Instance {{ $labels.instance }} down"
          description: "{{ $labels.instance }} of job {{ $labels.job }} has been down for more than 5 minutes."
启动
docker-compose up -d
二. 部署snmp_exporter
项目地址:https://github.com/prometheus/snmp_exporter
下载项目文件
https://github.com/prometheus/snmp_exporter/releases
docker 部署
# Run snmp_exporter v0.20.0 detached on the host network, capped at 1g of
# memory, reading /config/snmp.yml from the bind-mounted host directory and
# listening on port 9116.
docker run \
  --detach \
  --name snmp-exporter \
  --restart=always \
  --memory 1g \
  -v /data/snmp_exporter/conf:/config \
  --network host \
  prom/snmp-exporter:v0.20.0 \
  --web.listen-address=:9116 \
  --config.file=/config/snmp.yml
二进制部署
# cat /etc/systemd/system/snmp_exporter.service
# systemd unit that runs the snmp_exporter binary as a service.
[Unit]
Description=snmp_exporter
# Start only after the network is up.
After=network.target

[Service]
# The exporter reads its module definitions from snmp.yml.
ExecStart=/data/snmp_exporter/snmp_exporter --config.file=/data/snmp_exporter/snmp.yml
# Restart automatically if the process exits with an error.
Restart=on-failure

[Install]
WantedBy=multi-user.target
三. 根据网络设备型号查询 SNMP OID 表
上面用到的 snmp.yml 需要根据设备的品牌和型号自行适配对应的 OID,才能采集到数据。示例如下:
# snmp_exporter module "if_mib": the OIDs to query and how to expose them
# as metrics.
if_mib:
  auth:
    # Placeholder: replace with the device's SNMP community string.
    community: <安全码>
  # Subtrees fetched with an SNMP walk.
  walk:
    - 1.3.6.1.2.1.2
    - 1.3.6.1.2.1.31.1.1
    - 1.3.6.1.4.1.9.2.1  # switch CPU information
    - 1.3.6.1.4.1.25506.2.6.1.1.1.1.6.122
    - 1.3.6.1.4.1.25506.2.6.1.1.1.1.10.122
    - 1.3.6.1.4.1.25506.2.6.1.1.1.1.8.122
    - 1.3.6.1.4.1.25506.2.5.1.1.4.1.1.5.7995393
    - 1.3.6.1.4.1.35047.1.5.1.6.1
    - 1.3.6.1.4.1.35047.1.6.2
    - 1.3.6.1.4.1.35047.1.4
    - 1.3.6.1.4.1.35047.2.2.19.0
    - 1.3.6.1.4.1.35047.2.10.8
    - 1.3.6.1.4.1.35047.2.10.9
    - 1.3.6.1.4.1.35047.2.10.5
    - 1.3.6.1.4.1.35047.1.9.1.4.1
    - 1.3.6.1.4.1.35047.2.2.5.0
    - 1.3.6.1.4.1.35047.2.2.6.0
  # Single OIDs fetched with an SNMP get.
  get:
    - 1.3.6.1.2.1.1.3.0
  metrics:
    # Switch CPU utilization, in percent
    - name: ifswich_busyPer
      oid: 1.3.6.1.4.1.25506.2.6.1.1.1.1.6.122
      type: gauge
      help: swichCPU utilization
    # Total switch memory size, in bytes
    - name: ifswich_memsize
      oid: 1.3.6.1.4.1.25506.2.6.1.1.1.1.10.122
      type: gauge
      help: swichMemoryPoolsize
...
使用 snmpwalk 工具测试相关 OID 能否正常读取:
# Walk the 1.3.6.1.4.1.35047.1.5.1.6 subtree over SNMP v2c. Replace <安全码>
# with the community string and x.x.x.x with the device address before running.
snmpwalk -v 2c -c <安全码> x.x.x.x 1.3.6.1.4.1.35047.1.5.1.6