Carlos Aguni

Highly motivated self-taught IT analyst. Always learning and ready to explore new skills. An eternal apprentice.


Docker Compose Prometheus AlertManager Grafana

14 Jun 2022 »

Reference: https://www.jlcp.com.br/deploy-stack-prometheus-em-docker/

# Monitoring stack: Prometheus, Grafana and Alertmanager, all attached to one
# user-defined bridge network so the containers can reach each other by
# service name.
version: '3.7'

services:
  # Metrics collection and alert-rule evaluation.
  prometheus:
    image: prom/prometheus
    container_name: prometheus
    volumes:
      # Main configuration and the rule file it loads, taken from this directory.
      - ./prometheus.yml:/etc/prometheus/prometheus.yml
      - ./alert.rules:/etc/prometheus/alert.rules
    ports:
      # HOST:CONTAINER mappings are quoted so YAML always reads them as strings
      # (unquoted digit:digit scalars can be typed as base-60 numbers).
      - "9090:9090"
    networks:
      - "monitoring-network"

  # Dashboards; started after the prometheus container.
  grafana:
    image: grafana/grafana
    container_name: grafana
    volumes:
      # Grafana state is kept on the host under ./data/grafana/data.
      # NOTE(review): the container user must be able to write to this
      # directory - confirm ownership/permissions on the host.
      - ./data/grafana/data:/var/lib/grafana
    ports:
      - "3000:3000"
    networks:
      - "monitoring-network"
    depends_on:
      - prometheus

  # Receives alerts from Prometheus and routes them to receivers.
  alertmanager:
    image: prom/alertmanager
    container_name: alertmanager
    volumes:
      - ./alertmanager.yml:/etc/alertmanager/alertmanager.yml
    ports:
      - "9093:9093"
    networks:
      - "monitoring-network"

networks:
  monitoring-network:
    driver: bridge

prometheus.yml

# Global settings: defaults for every scrape job and for rule evaluation.
global:
  scrape_interval: 5s      # Scrape targets every 5 seconds. Default is every 1 minute.
  evaluation_interval: 5s  # Evaluate rules every 5 seconds. The default is every 1 minute.
  # Set explicitly: the built-in 10s default would be longer than the 5s
  # scrape interval above, and a scrape must time out before the next one starts.
  scrape_timeout: 5s

# Alertmanager configuration: where firing alerts are sent.
alerting:
  alertmanagers:
    - static_configs:
        - targets:
            # Host name and port of the Alertmanager instance.
            - alertmanager:9093

# Load rules once and periodically evaluate them according to the global 'evaluation_interval'.
rule_files:
  - "/etc/prometheus/alert.rules"
  # - "first_rules.yml"
  # - "second_rules.yml"

# Scrape configuration: one entry per job.
scrape_configs:
  # The job name is added as a label `job=<job_name>` to any timeseries scraped from this config.

  # Prometheus scraping its own metrics endpoint.
  - job_name: 'prometheus'

    # metrics_path defaults to '/metrics'
    # scheme defaults to 'http'.

    static_configs:
      - targets: ['localhost:9090']

  # Remote host 'lab000'.
  # NOTE(review): port 9100 suggests a node_exporter endpoint - confirm.
  - job_name: 'lab000'

    # metrics_path defaults to '/metrics'
    # scheme defaults to 'http'.

    static_configs:
      - targets: ['192.168.0.11:9100']

alertmanager.yml

# Root route: every alert is grouped by alert name and sent to 'web.hook'.
route:
  group_by: ['alertname']
  group_wait: 30s       # Wait before sending the first notification for a new group.
  group_interval: 5m    # Wait before notifying about new alerts added to a group.
  repeat_interval: 1h   # Wait before re-sending a notification that is still firing.
  receiver: 'web.hook'

receivers:
  - name: 'web.hook'
    webhook_configs:
      # NOTE(review): if Alertmanager runs in a container, 127.0.0.1 is the
      # container itself, not the host - confirm where the webhook listens.
      - url: 'http://127.0.0.1:5001/'

# Mute 'warning' alerts while a 'critical' alert with the same alertname, dev
# and instance labels is firing.
inhibit_rules:
  # source_matchers/target_matchers replace the deprecated
  # source_match/target_match fields.
  - source_matchers:
      - 'severity="critical"'
    target_matchers:
      - 'severity="warning"'
    equal: ['alertname', 'dev', 'instance']

alert.rules

# Alerting rules loaded by Prometheus through `rule_files`.
groups:
  # Availability of the scraped host.
  - name: node_alerts
    rules:
      - alert: InstanceDown
        # `up` is 0 when the last scrape of a target failed. The selector uses
        # 'lab000', the job name given to the :9100 target in prometheus.yml;
        # the previous value "node_exporter" matched no configured job, so the
        # alert could never fire.
        expr: up{job="lab000"} == 0
        for: 1m
        labels:
          severity: critical
        annotations:
          # The label templates were missing from the published text
          # ("Host  of"); restored here.
          summary: 'Host {{ $labels.instance }} of job {{ $labels.job }} is down'

  # State of the crash.service systemd unit, as exposed by the scraped host.
  - name: crash_service
    rules:
      - alert: ServiceDown
        # Fires when the "active" state series for crash.service is not 1.
        # NOTE(review): if the series is absent altogether this expression
        # returns nothing and the alert stays silent - confirm that is acceptable.
        expr: node_systemd_unit_state{name="crash.service",state="active",type="simple"} != 1
        for: 1m
        labels:
          severity: critical
        annotations:
          summary: 'Host {{ $labels.instance }} of job {{ $labels.job }}: {{ $labels.name }} is not active'