init
This commit is contained in:
28
alertmanager/config.yml
Normal file
28
alertmanager/config.yml
Normal file
@ -0,0 +1,28 @@
|
||||
---
# Alertmanager configuration: every alert is routed to a single e-mail
# receiver and sent through the 163.com SMTP service.
global:
  # SMTP relay (163.com mail server) used to deliver notifications.
  smtp_smarthost: 'smtp.163.com:465'
  # Mailbox that appears as the sender of the notification mails.
  smtp_from: 'cdring@163.com'
  # SMTP login name, i.e. the sending mailbox itself.
  smtp_auth_username: 'cdring@163.com'
  # SMTP login password of the sending mailbox (placeholder value).
  smtp_auth_password: 'your-password'
  # Whether TLS (STARTTLS) is required for the SMTP session; disabled here.
  # NOTE(review): port 465 is normally implicit TLS - confirm this is intended.
  smtp_require_tls: false

route:
  # Default (global) receiver; this parameter is mandatory.
  receiver: email
  # Alerts are grouped by their alertname label.
  group_by:
    - alertname
  # When an alert arrives, wait this long to see whether more alerts show up,
  # and if so send them together.
  group_wait: 10s
  # After a notification was sent successfully, a new alert for the group has
  # to wait this long before it can be sent.
  group_interval: 10s
  # If the last notification was sent successfully and the problem is still
  # unresolved, wait this long before sending the alert again.
  repeat_interval: 10m

receivers:
  - name: email
    email_configs:
      # Mailbox that receives the notification mails.
      - to: 'cdring@163.com'
|
107
docker-compose.yaml
Normal file
107
docker-compose.yaml
Normal file
@ -0,0 +1,107 @@
|
||||
---
# Monitoring stack: Prometheus, Alertmanager, cAdvisor, node_exporter and
# Grafana, all attached to one user-defined bridge network. Services reach each
# other by service name over that network, so legacy `links:` are not needed;
# start-up ordering is expressed with `depends_on` instead.
version: '3.3'

volumes:
  prometheus_data: {}
  # Keeps Alertmanager state (the --storage.path directory) across re-creation.
  alertmanager_data: {}
  grafana_data: {}

networks:
  monitoring:
    driver: bridge

services:
  prometheus:
    image: prom/prometheus:v2.37.6
    container_name: prometheus
    restart: always
    volumes:
      - /etc/localtime:/etc/localtime:ro
      - ./prometheus/:/etc/prometheus/
      - prometheus_data:/prometheus
    command:
      - '--config.file=/etc/prometheus/prometheus.yml'
      - '--storage.tsdb.path=/prometheus'
      - '--web.console.libraries=/usr/share/prometheus/console_libraries'
      - '--web.console.templates=/usr/share/prometheus/consoles'
      # Enable hot-reloading of the configuration.
      - '--web.enable-lifecycle'
      # Admin API (left disabled).
      # - '--web.enable-admin-api'
      # Maximum retention time for historical data (the default is 15 days).
      - '--storage.tsdb.retention.time=30d'
    networks:
      - monitoring
    expose:
      - '9090'
    ports:
      - '9090:9090'
    # Start every service Prometheus scrapes or notifies before Prometheus.
    depends_on:
      - alertmanager
      - cadvisor
      - node_exporter

  alertmanager:
    image: prom/alertmanager:v0.25.0
    container_name: alertmanager
    restart: always
    volumes:
      - /etc/localtime:/etc/localtime:ro
      - ./alertmanager/:/etc/alertmanager/
      # Backs --storage.path so the state survives `docker-compose down`.
      - alertmanager_data:/alertmanager
    command:
      - '--config.file=/etc/alertmanager/config.yml'
      - '--storage.path=/alertmanager'
    networks:
      - monitoring
    expose:
      - '9093'
    ports:
      - '9093:9093'

  cadvisor:
    # The Docker Hub image google/cadvisor is deprecated; releases are
    # published to gcr.io. Pinned to a fixed tag instead of `latest` so that
    # deployments are reproducible.
    image: gcr.io/cadvisor/cadvisor:v0.47.2
    container_name: cadvisor
    restart: always
    volumes:
      - /etc/localtime:/etc/localtime:ro
      - /:/rootfs:ro
      - /var/run:/var/run:rw
      - /sys:/sys:ro
      - /var/lib/docker/:/var/lib/docker:ro
    networks:
      - monitoring
    # Only reachable from inside the monitoring network (not published).
    expose:
      - '8080'

  node_exporter:
    image: prom/node-exporter:v1.5.0
    container_name: node-exporter
    restart: always
    volumes:
      - /etc/localtime:/etc/localtime:ro
      - /proc:/host/proc:ro
      - /sys:/host/sys:ro
      - /:/rootfs:ro
    # Point the exporter at the host mounts above; without these flags it reads
    # the container's own /proc, /sys and / and the bind mounts are unused.
    command:
      - '--path.procfs=/host/proc'
      - '--path.sysfs=/host/sys'
      - '--path.rootfs=/rootfs'
    networks:
      - monitoring
    ports:
      - '9100:9100'

  grafana:
    image: grafana/grafana:9.4.3
    container_name: grafana
    restart: always
    volumes:
      - /etc/localtime:/etc/localtime:ro
      - grafana_data:/var/lib/grafana
      - ./grafana/provisioning/:/etc/grafana/provisioning/
    # Environment variables for Grafana are loaded from this file.
    env_file:
      - ./grafana/config.monitoring
    networks:
      - monitoring
    ports:
      - '3000:3000'
    depends_on:
      - prometheus
|
2
grafana/config.monitoring
Normal file
2
grafana/config.monitoring
Normal file
@ -0,0 +1,2 @@
|
||||
# Admin password handed to the grafana service through env_file in docker-compose.yaml.
# NOTE(review): "password" looks like a placeholder - replace it before any real deployment.
GF_SECURITY_ADMIN_PASSWORD=password
|
||||
# Presumably overrides Grafana's [users] allow_sign_up setting, turning self-service sign-up off - confirm against the Grafana docs.
GF_USERS_ALLOW_SIGN_UP=false
|
12
prometheus/alert.yml
Normal file
12
prometheus/alert.yml
Normal file
@ -0,0 +1,12 @@
|
||||
---
# Alerting rules loaded by Prometheus.
groups:
  - name: Prometheus alert
    rules:
      # Raise an alert for any instance that has been unreachable for more
      # than 30 seconds.
      - alert: 服务告警
        expr: up == 0
        for: 30s
        annotations:
          instance: '{{ $labels.instance }}'
          description: '{{ $labels.job }} 服务已关闭'
        labels:
          severity: critical
|
36
prometheus/prometheus.yml
Normal file
36
prometheus/prometheus.yml
Normal file
@ -0,0 +1,36 @@
|
||||
---
# Global settings
global:
  # Scrape targets every 15 seconds (the default is once per minute).
  scrape_interval: 15s
  # Evaluate rules every 15 seconds (the default is once per minute).
  evaluation_interval: 15s

# Alertmanager settings
alerting:
  alertmanagers:
    - static_configs:
        - targets:
            - 'alertmanager:9093'

# Alerting (trigger) rule files
rule_files:
  - 'alert.yml'

# Scrape settings
scrape_configs:
  - job_name: prometheus
    # Per-job override of the global default: scrape this job's targets every
    # 15 seconds.
    scrape_interval: 15s
    static_configs:
      - targets:
          - 'localhost:9090'

  - job_name: cadvisor
    scrape_interval: 15s
    static_configs:
      - labels:
          instance: Prometheus服务器
        targets:
          - 'cadvisor:8080'

  - job_name: node-exporter
    scrape_interval: 15s
    static_configs:
      - labels:
          instance: Prometheus服务器
        targets:
          - 'node_exporter:9100'
|
Reference in New Issue
Block a user