1. 安装前准备
确保你已安装以下组件:
# Install Docker Engine using the official convenience script.
curl -fsSL https://get.docker.com | bash
# Install the standalone Docker Compose binary (release 1.29.2) that matches
# this machine's OS and CPU architecture.
# -f makes curl exit non-zero on an HTTP error instead of saving the error
# page as the executable.
sudo curl -fL "https://github.com/docker/compose/releases/download/1.29.2/docker-compose-$(uname -s)-$(uname -m)" \
  -o /usr/local/bin/docker-compose
sudo chmod +x /usr/local/bin/docker-compose
# Verify that the binary is installed and runnable.
docker-compose --version
2. 创建项目目录结构
# Create the project root together with both sub-directories in one call
# (-p creates missing parents and is a no-op for existing directories),
# then switch into the project root.
mkdir -p ~/prometheus-stack/prometheus ~/prometheus-stack/alertmanager
cd ~/prometheus-stack
3. 创建配置文件
nano docker-compose.yml
粘贴以下内容:
# Compose stack with three services: Prometheus, Grafana and Alertmanager.
version: '3.3'

services:
  prometheus:
    image: prom/prometheus
    container_name: prometheus
    ports:
      - "9090:9090"
    volumes:
      # Main Prometheus configuration file.
      - ./prometheus.yml:/etc/prometheus/prometheus.yml
      # The host directory ./prometheus is exposed as the rules directory.
      - ./prometheus:/etc/prometheus/rules
    restart: unless-stopped

  grafana:
    image: grafana/grafana-oss
    container_name: grafana
    ports:
      # Host port 3001 maps to container port 3000.
      - "3001:3000"
    environment:
      # NOTE(review): default admin/admin credentials; change them before
      # exposing this service beyond a test environment.
      - GF_SECURITY_ADMIN_USER=admin
      - GF_SECURITY_ADMIN_PASSWORD=admin
    volumes:
      # Named volume (declared below) holding Grafana's data directory.
      - grafana-storage:/var/lib/grafana
    restart: unless-stopped

  alertmanager:
    image: prom/alertmanager
    container_name: alertmanager
    ports:
      - "9093:9093"
    volumes:
      - ./alertmanager.yml:/etc/alertmanager/alertmanager.yml
    command:
      # Point Alertmanager at the configuration file mounted above.
      - '--config.file=/etc/alertmanager/alertmanager.yml'
    restart: unless-stopped

volumes:
  grafana-storage:
nano prometheus.yml
内容如下:
# Prometheus configuration: global timings, Alertmanager endpoint, rule files
# and scrape jobs.
global:
  scrape_interval: 15s  # Set the scrape interval to every 15 seconds. Default is every 1 minute.
  evaluation_interval: 15s  # Evaluate rules every 15 seconds. The default is every 1 minute.
  # scrape_timeout is set to the global default (10s).

# Alertmanager configuration
alerting:
  alertmanagers:
    - static_configs:
        - targets:
            # Placeholder: replace with the address Prometheus uses to reach
            # Alertmanager.
            - "xxxxx:9093"

# Load rules once and periodically evaluate them according to the global 'evaluation_interval'.
rule_files:
  - "/etc/prometheus/rules/*.yml"
  # - "first_rules.yml"
  # - "second_rules.yml"

# Scrape configurations: Prometheus itself plus the exporters listed below.
scrape_configs:
  # The job name is added as a label `job=<job_name>` to any timeseries scraped from this config.
  - job_name: "prometheus"
    # metrics_path defaults to '/metrics'
    # scheme defaults to 'http'.
    static_configs:
      - targets: ["localhost:9090"]
        # The label name is added as a label `label_name=<label_value>` to any timeseries scraped from this config.
        labels:
          app: "prometheus"

  - job_name: "agent_windows"
    static_configs:
      - targets: ["xxxx:9182"]
        labels:
          app: "windows"
          instance: "虚拟机"

  - job_name: "alertmanager"
    static_configs:
      # Inside the Prometheus container `localhost` is the Prometheus container
      # itself, so Alertmanager is addressed by its Compose service name.
      - targets: ["alertmanager:9093"]
        labels:
          app: "alertmanager"

  - job_name: "node_exporter"
    static_configs:
      - targets: ["xxxx:9100"]
        labels:
          app: "node_exporter"
          instance: "阿里云测试服务器"

  - job_name: "process_exporter"
    static_configs:
      - targets: ["xxxx:9256"]
        labels:
          app: "process"
          instance: "阿里云测试服务器"

  # HTTP probing: each target URL is handed to the prober as the `target`
  # query parameter, and the scrape itself is sent to the prober address set
  # in relabel_configs below.
  - job_name: 'http_probe'
    metrics_path: /probe
    params:
      module: [http_2xx]
    static_configs:
      - targets:
          - "https:XXXXXX"
        labels:
          instance: "公司主页"
          app: webapp
      - targets:
          # Quoted because the URL contains '#'.
          - "http://xxxxxxxx:9997/docs#/"
        labels:
          instance: "PDF合并"
          app: pdfservice
    relabel_configs:
      # Copy the configured target URL into the `target` URL parameter.
      - source_labels: [__address__]
        target_label: __param_target
      # - source_labels: [__param_target]
      #   target_label: url
      # Keep the `instance` label set in static_configs instead of
      # overwriting it; the rule below stays commented out for that reason.
      # - source_labels: [__param_target]
      #   target_label: instance
      # Send the actual scrape request to the prober (placeholder address).
      - target_label: __address__
        replacement: xxxxxxx:9115
nano alertmanager.yml
# Alertmanager configuration: one default route that forwards every alert to
# a webhook receiver, plus one inhibition rule.
route:
  group_by: ['alertname']  # Group alerts by the alertname label (same-name alerts are merged into one notification).
  group_wait: 30s  # Wait 30 seconds before sending the first notification for a new group.
  group_interval: 5m  # Wait at least 5 minutes between notifications for the same group.
  repeat_interval: 1h  # Re-send the notification every hour while the alert keeps firing.
  receiver: 'webhook_receiver'  # Name of the default receiver.

receivers:
  - name: 'webhook_receiver'
    webhook_configs:
      - url: 'http://xxxx:5012/alertmanager_to_feishu'
        # Also notify when an alert is resolved.
        send_resolved: true

inhibit_rules:
  # Suppress 'warning' alerts while a 'critical' alert with the same
  # alertname, dev and instance labels is firing.
  - source_match:
      severity: 'critical'
    target_match:
      severity: 'warning'
    equal: ['alertname', 'dev', 'instance']
其余的告警规则文件(*.yml)放到 prometheus/ 目录下即可:该目录已挂载到容器内的 /etc/prometheus/rules,并由 prometheus.yml 中的 rule_files 加载。
4. 启动服务
docker-compose up -d