docker-compose.yml
version: '3.7'

services:
  prometheus:
    image: prom/prometheus
    container_name: prometheus
    volumes:
      - ./prometheus.yml:/etc/prometheus/prometheus.yml
      - ./alert.rules:/etc/prometheus/alert.rules
    ports:
      - 9090:9090
    networks:
      - "monitoring-network"

  grafana:
    image: grafana/grafana
    container_name: grafana
    volumes:
      - ./data/grafana/data:/var/lib/grafana
    ports:
      - 3000:3000
    networks:
      - "monitoring-network"
    depends_on:
      - prometheus

  alertmanager:
    image: prom/alertmanager
    container_name: alertmanager
    volumes:
      - ./alertmanager.yml:/etc/alertmanager/alertmanager.yml
    ports:
      - 9093:9093
    networks:
      - "monitoring-network"

networks:
  monitoring-network:
    driver: bridge
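With the three configuration files sitting next to docker-compose.yml, the stack comes up in one shot. The mkdir/chown step below is an assumption worth calling out: the grafana/grafana image runs as UID 472 inside the container, so a bind-mounted data directory on a Linux host usually needs matching ownership or Grafana fails at startup with a permissions error. Use docker-compose instead of docker compose if you are on the older standalone binary.

# Pre-create Grafana's bind-mounted data directory with the UID the
# container runs as (assumption: Linux host, sudo available).
mkdir -p data/grafana/data
sudo chown -R 472:472 data/grafana/data

# Bring the stack up in the background and confirm all three containers are running.
docker compose up -d
docker compose ps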
prometheus.yml
# my global config
global:
  scrape_interval: 5s # Set the scrape interval to every 5 seconds. Default is every 1 minute.
  evaluation_interval: 5s # Evaluate rules every 5 seconds. The default is every 1 minute.
  # scrape_timeout is set to the global default (10s).

# Alertmanager configuration
alerting:
  alertmanagers:
    - static_configs:
        - targets:
            - alertmanager:9093

# Load rules once and periodically evaluate them according to the global 'evaluation_interval'.
rule_files:
  - "/etc/prometheus/alert.rules"
  # - "first_rules.yml"
  # - "second_rules.yml"

# A scrape configuration with two jobs: Prometheus scraping itself,
# and a node_exporter running on the lab host.
scrape_configs:
  # The job name is added as a label `job=<job_name>` to any timeseries scraped from this config.
  - job_name: 'prometheus'
    # metrics_path defaults to '/metrics'
    # scheme defaults to 'http'.
    static_configs:
      - targets: ['localhost:9090']

  - job_name: 'lab000'
    # metrics_path defaults to '/metrics'
    # scheme defaults to 'http'.
    static_configs:
      - targets: ['192.168.0.11:9100']
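Note that the lab000 job assumes a node_exporter is already listening on 192.168.0.11:9100; it is not part of this compose stack. Before (re)starting Prometheus, the configuration can be validated with the promtool binary that ships inside the prom/prometheus image. A sketch, assuming the files live in the current directory:

# Validate prometheus.yml (and the rule file it references) without starting Prometheus.
docker run --rm \
  --entrypoint promtool \
  -v "$(pwd)/prometheus.yml:/etc/prometheus/prometheus.yml" \
  -v "$(pwd)/alert.rules:/etc/prometheus/alert.rules" \
  prom/prometheus check config /etc/prometheus/prometheus.yml

# After editing the file, a plain restart picks up the change
# (this compose file does not enable the /-/reload lifecycle endpoint).
docker compose restart prometheus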
alertmanager.yml
route:
  group_by: ['alertname']
  group_wait: 30s
  group_interval: 5m
  repeat_interval: 1h
  receiver: 'web.hook'

receivers:
  - name: 'web.hook'
    webhook_configs:
      - url: 'http://127.0.0.1:5001/'

inhibit_rules:
  - source_match:
      severity: 'critical'
    target_match:
      severity: 'warning'
    equal: ['alertname', 'dev', 'instance']
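One caveat: inside the alertmanager container, 127.0.0.1:5001 is the container itself, so a webhook receiver running on the Docker host would need the host's IP (or host.docker.internal, where available) instead. The file can be linted with the amtool binary bundled in the prom/alertmanager image; again a sketch with local paths assumed:

# Validate alertmanager.yml before restarting the service.
docker run --rm \
  --entrypoint amtool \
  -v "$(pwd)/alertmanager.yml:/etc/alertmanager/alertmanager.yml" \
  prom/alertmanager check-config /etc/alertmanager/alertmanager.yml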
alert.rules
groups:
  - name: node_alerts
    rules:
      - alert: InstanceDown
        # The job label must match the job_name in prometheus.yml that
        # scrapes node_exporter ('lab000' above).
        expr: up{job="lab000"} == 0
        for: 1m
        labels:
          severity: critical
        annotations:
          summary: "Host {{ $labels.instance }} of job {{ $labels.job }} is down"

  - name: crash_service
    rules:
      - alert: ServiceDown
        expr: node_systemd_unit_state{name="crash.service",state="active",type="simple"} != 1
        for: 1m
        labels:
          severity: critical
        annotations:
          summary: "Host {{ $labels.instance }} of job {{ $labels.job }}: crash.service is not active"
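The rule file can be checked on its own, and once the stack is running, pending and firing alerts are visible through the Prometheus HTTP API. The curl query below is just an illustration; the web UI at http://localhost:9090/alerts shows the same information:

# Validate the rule file syntax.
docker run --rm \
  --entrypoint promtool \
  -v "$(pwd)/alert.rules:/etc/prometheus/alert.rules" \
  prom/prometheus check rules /etc/prometheus/alert.rules

# List alerts currently known to Prometheus (pending or firing).
curl -s 'http://localhost:9090/api/v1/query?query=ALERTS'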