From 8c1d7aa6606cc4ded4ca57f72ecfa29ca801cb37 Mon Sep 17 00:00:00 2001
From: linge
Date: Mon, 13 Mar 2023 23:23:34 +0800
Subject: [PATCH] init

---
 alertmanager/config.yml   |  28 ++++++++++
 docker-compose.yaml       | 113 ++++++++++++++++++++++++++++++++++++++
 grafana/config.monitoring |   3 +
 prometheus/alert.yml      |  12 +++++
 prometheus/prometheus.yml |  36 +++++++++++++
 5 files changed, 192 insertions(+)
 create mode 100644 alertmanager/config.yml
 create mode 100644 docker-compose.yaml
 create mode 100644 grafana/config.monitoring
 create mode 100644 prometheus/alert.yml
 create mode 100644 prometheus/prometheus.yml

diff --git a/alertmanager/config.yml b/alertmanager/config.yml
new file mode 100644
index 0000000..619e8ac
--- /dev/null
+++ b/alertmanager/config.yml
@@ -0,0 +1,28 @@
+global:
+  # SMTP relay (NetEase 163 mail) used to send notifications.
+  smtp_smarthost: 'smtp.163.com:465'
+  # Sender address.
+  smtp_from: 'cdring@163.com'
+  # SMTP login name; the same as the sender mailbox.
+  smtp_auth_username: 'cdring@163.com'
+  # SMTP password (placeholder; replace before deploying).
+  smtp_auth_password: 'your-password'
+  # Do not require STARTTLS (port 465 is normally implicit-TLS SMTPS).
+  smtp_require_tls: false
+
+route:
+  group_by: ['alertname']
+  # Wait this long for more alerts of a new group before the first notification.
+  group_wait: 10s
+  # Wait this long before notifying about alerts added to an already notified group.
+  group_interval: 10s
+  # Re-send a notification after this long while the alert is still unresolved.
+  repeat_interval: 10m
+  # Default receiver; this field is mandatory.
+  receiver: email
+
+receivers:
+  - name: 'email'
+    # Recipient mailbox.
+    email_configs:
+      - to: 'cdring@163.com'
diff --git a/docker-compose.yaml b/docker-compose.yaml
new file mode 100644
index 0000000..0079c45
--- /dev/null
+++ b/docker-compose.yaml
@@ -0,0 +1,113 @@
+version: '3.3'
+
+volumes:
+  prometheus_data: {}
+  grafana_data: {}
+
+networks:
+  monitoring:
+    driver: bridge
+
+services:
+  prometheus:
+    image: prom/prometheus:v2.37.6
+    container_name: prometheus
+    restart: always
+    volumes:
+      - /etc/localtime:/etc/localtime:ro
+      - ./prometheus/:/etc/prometheus/
+      - prometheus_data:/prometheus
+    command:
+      - '--config.file=/etc/prometheus/prometheus.yml'
+      - '--storage.tsdb.path=/prometheus'
+      - '--web.console.libraries=/usr/share/prometheus/console_libraries'
+      - '--web.console.templates=/usr/share/prometheus/consoles'
+      # Allow config reload/shutdown through the HTTP lifecycle endpoints.
+      - '--web.enable-lifecycle'
+      # Admin API; left disabled. Uncomment the flag below to enable it.
+      # - '--web.enable-admin-api'
+      # Keep samples for 30 days (the default retention is 15 days).
+      - '--storage.tsdb.retention.time=30d'
+    networks:
+      - monitoring
+    links:
+      - alertmanager
+      - cadvisor
+      - node_exporter
+    expose:
+      - '9090'
+    ports:
+      - '9090:9090'
+    depends_on:
+      - cadvisor
+
+  alertmanager:
+    image: prom/alertmanager:v0.25.0
+    container_name: alertmanager
+    restart: always
+    volumes:
+      - /etc/localtime:/etc/localtime:ro
+      - ./alertmanager/:/etc/alertmanager/
+    command:
+      - '--config.file=/etc/alertmanager/config.yml'
+      - '--storage.path=/alertmanager'
+    networks:
+      - monitoring
+    expose:
+      - '9093'
+    ports:
+      - '9093:9093'
+
+  cadvisor:
+    # google/cadvisor on Docker Hub is deprecated; upstream publishes to gcr.io.
+    image: gcr.io/cadvisor/cadvisor:v0.47.0
+    container_name: cadvisor
+    restart: always
+    volumes:
+      - /etc/localtime:/etc/localtime:ro
+      - /:/rootfs:ro
+      - /var/run:/var/run:rw
+      - /sys:/sys:ro
+      - /var/lib/docker/:/var/lib/docker:ro
+    networks:
+      - monitoring
+    expose:
+      - '8080'
+
+  node_exporter:
+    image: prom/node-exporter:v1.5.0
+    container_name: node-exporter
+    restart: always
+    volumes:
+      - /etc/localtime:/etc/localtime:ro
+      - /proc:/host/proc:ro
+      - /sys:/host/sys:ro
+      - /:/rootfs:ro
+    # Read the bind-mounted host paths instead of the container's own view.
+    command:
+      - '--path.procfs=/host/proc'
+      - '--path.sysfs=/host/sys'
+      - '--path.rootfs=/rootfs'
+    networks:
+      - monitoring
+    ports:
+      - '9100:9100'
+
+  grafana:
+    image: grafana/grafana:9.4.3
+    container_name: grafana
+    restart: always
+    volumes:
+      - /etc/localtime:/etc/localtime:ro
+      - grafana_data:/var/lib/grafana
+      - ./grafana/provisioning/:/etc/grafana/provisioning/
+    env_file:
+      - ./grafana/config.monitoring
+    networks:
+      - monitoring
+    links:
+      - prometheus
+    ports:
+      - '3000:3000'
+    depends_on:
+      - prometheus
diff --git a/grafana/config.monitoring b/grafana/config.monitoring
new file mode 100644
index 0000000..aaba4e0
--- /dev/null
+++ b/grafana/config.monitoring
@@ -0,0 +1,3 @@
+# Default admin password; change it before exposing Grafana.
+GF_SECURITY_ADMIN_PASSWORD=password
+GF_USERS_ALLOW_SIGN_UP=false
diff --git a/prometheus/alert.yml b/prometheus/alert.yml
new file mode 100644
index 0000000..32cfadf
--- /dev/null
+++ b/prometheus/alert.yml
@@ -0,0 +1,12 @@
+groups:
+  - name: Prometheus alert
+    rules:
+      # Fire when any scrape target has been unreachable for more than 30 seconds.
+      - alert: 服务告警
+        expr: up == 0
+        for: 30s
+        labels:
+          severity: critical
+        annotations:
+          instance: "{{ $labels.instance }}"
+          description: "{{ $labels.job }} 服务已关闭"
diff --git a/prometheus/prometheus.yml b/prometheus/prometheus.yml
new file mode 100644
index 0000000..d87def3
--- /dev/null
+++ b/prometheus/prometheus.yml
@@ -0,0 +1,36 @@
+# Global settings
+global:
+  scrape_interval: 15s  # Scrape every 15 seconds (default: every 1 minute).
+  evaluation_interval: 15s  # Evaluate rules every 15 seconds (default: every 1 minute).
+
+# Alertmanager endpoints
+alerting:
+  alertmanagers:
+    - static_configs:
+        - targets: ['alertmanager:9093']
+
+# Alerting rule files
+rule_files:
+  - "alert.yml"
+
+# Scrape jobs
+scrape_configs:
+  - job_name: 'prometheus'
+    # Per-job scrape interval (takes precedence over the global value).
+    scrape_interval: 15s
+    static_configs:
+      - targets: ['localhost:9090']
+
+  - job_name: 'cadvisor'
+    scrape_interval: 15s
+    static_configs:
+      - targets: ['cadvisor:8080']
+        labels:
+          instance: Prometheus服务器
+
+  - job_name: 'node-exporter'
+    scrape_interval: 15s
+    static_configs:
+      - targets: ['node_exporter:9100']
+        labels:
+          instance: Prometheus服务器