This commit is contained in:
linge
2023-03-13 23:23:34 +08:00
commit 8c1d7aa660
5 changed files with 185 additions and 0 deletions

28
alertmanager/config.yml Normal file
View File

@ -0,0 +1,28 @@
global:
#163服务器
smtp_smarthost: 'smtp.163.com:465'
#发邮件的邮箱
smtp_from: 'cdring@163.com'
#发邮件的邮箱用户名,也就是你的邮箱     
smtp_auth_username: 'cdring@163.com'
#发邮件的邮箱密码
smtp_auth_password: 'your-password'
#进行tls验证
smtp_require_tls: false
route:
group_by: ['alertname']
# 当收到告警的时候等待group_wait配置的时间看是否还有告警如果有就一起发出去
group_wait: 10s
# 如果上次告警信息发送成功此时又来了一个新的告警数据则需要等待group_interval配置的时间才可以发送出去
group_interval: 10s
# 如果上次告警信息发送成功,且问题没有解决,则等待 repeat_interval配置的时间再次发送告警数据
repeat_interval: 10m
# 全局报警组,这个参数是必选的
receiver: email
receivers:
- name: 'email'
#收邮件的邮箱
email_configs:
- to: 'cdring@163.com'

107
docker-compose.yaml Normal file
View File

@ -0,0 +1,107 @@
version: '3.3'
volumes:
prometheus_data: {}
grafana_data: {}
networks:
monitoring:
driver: bridge
services:
prometheus:
image: prom/prometheus:v2.37.6
container_name: prometheus
restart: always
volumes:
- /etc/localtime:/etc/localtime:ro
- ./prometheus/:/etc/prometheus/
- prometheus_data:/prometheus
command:
- '--config.file=/etc/prometheus/prometheus.yml'
- '--storage.tsdb.path=/prometheus'
- '--web.console.libraries=/usr/share/prometheus/console_libraries'
- '--web.console.templates=/usr/share/prometheus/consoles'
#热加载配置
- '--web.enable-lifecycle'
#api配置
#- '--web.enable-admin-api'
#历史数据最大保留时间默认15天
- '--storage.tsdb.retention.time=30d'
networks:
- monitoring
links:
- alertmanager
- cadvisor
- node_exporter
expose:
- '9090'
ports:
- 9090:9090
depends_on:
- cadvisor
alertmanager:
image: prom/alertmanager:v0.25.0
container_name: alertmanager
restart: always
volumes:
- /etc/localtime:/etc/localtime:ro
- ./alertmanager/:/etc/alertmanager/
command:
- '--config.file=/etc/alertmanager/config.yml'
- '--storage.path=/alertmanager'
networks:
- monitoring
expose:
- '9093'
ports:
- 9093:9093
cadvisor:
image: google/cadvisor:latest
container_name: cadvisor
restart: always
volumes:
- /etc/localtime:/etc/localtime:ro
- /:/rootfs:ro
- /var/run:/var/run:rw
- /sys:/sys:ro
- /var/lib/docker/:/var/lib/docker:ro
networks:
- monitoring
expose:
- '8080'
node_exporter:
image: prom/node-exporter:v1.5.0
container_name: node-exporter
restart: always
volumes:
- /etc/localtime:/etc/localtime:ro
- /proc:/host/proc:ro
- /sys:/host/sys:ro
- /:/rootfs:ro
networks:
- monitoring
ports:
- '9100:9100'
grafana:
image: grafana/grafana:9.4.3
container_name: grafana
restart: always
volumes:
- /etc/localtime:/etc/localtime:ro
- grafana_data:/var/lib/grafana
- ./grafana/provisioning/:/etc/grafana/provisioning/
env_file:
- ./grafana/config.monitoring
networks:
- monitoring
links:
- prometheus
ports:
- 3000:3000
depends_on:
- prometheus

View File

@ -0,0 +1,2 @@
GF_SECURITY_ADMIN_PASSWORD=password
GF_USERS_ALLOW_SIGN_UP=false

12
prometheus/alert.yml Normal file
View File

@ -0,0 +1,12 @@
groups:
- name: Prometheus alert
rules:
# 对任何实例超过30秒无法联系的情况发出警报
- alert: 服务告警
expr: up == 0
for: 30s
labels:
severity: critical
annotations:
instance: "{{ $labels.instance }}"
description: "{{ $labels.job }} 服务已关闭"

36
prometheus/prometheus.yml Normal file
View File

@ -0,0 +1,36 @@
# 全局配置
global:
scrape_interval: 15s # 将搜刮间隔设置为每15秒一次。默认是每1分钟一次。
evaluation_interval: 15s # 每15秒评估一次规则。默认是每1分钟一次。
# Alertmanager 配置
alerting:
alertmanagers:
- static_configs:
- targets: ['alertmanager:9093']
# 报警(触发器)配置
rule_files:
- "alert.yml"
# 搜刮配置
scrape_configs:
- job_name: 'prometheus'
# 覆盖全局默认值每15秒从该作业中刮取一次目标
scrape_interval: 15s
static_configs:
- targets: ['localhost:9090']
- job_name: 'cadvisor'
scrape_interval: 15s
static_configs:
- targets: ['cadvisor:8080']
labels:
instance: Prometheus服务器
- job_name: 'node-exporter'
scrape_interval: 15s
static_configs:
- targets: ['node_exporter:9100']
labels:
instance: Prometheus服务器