From 58eb53a6a85b64773feb8e66d177d3464fe59126 Mon Sep 17 00:00:00 2001
From: Micha
Date: Sat, 16 May 2026 21:59:20 +0200
Subject: [PATCH] ops: add monitoring compose stack

---
Review notes (text in this section is not part of the commit message):
  Line breaks and indentation were reconstructed from a whitespace-collapsed
  copy of this patch. The blob ids on the "index" lines were not recomputed,
  and the From: header carries no e-mail address in this copy.
  docker-compose.yml: the dashboard importer now resolves datasource inputs by
  plugin and leaves non-datasource inputs at their exported value; the "$" in
  the node-exporter mount-point regex is escaped as "$$" for Compose.
  promtail-config.yml: restored the <container-id> placeholders in the
  json-file path comment.

 monitoring/docker-compose.yml                 | 253 ++++++++++++++++++
 .../provisioning/dashboards/dashboards.yml    |  13 +
 .../provisioning/datasources/datasources.yml  |  19 ++
 monitoring/loki/loki-config.yml               |  47 ++++
 monitoring/prometheus/prometheus.yml          |  28 +++
 monitoring/promtail/promtail-config.yml       |  34 +++
 6 files changed, 394 insertions(+)
 create mode 100644 monitoring/docker-compose.yml
 create mode 100644 monitoring/grafana/provisioning/dashboards/dashboards.yml
 create mode 100644 monitoring/grafana/provisioning/datasources/datasources.yml
 create mode 100644 monitoring/loki/loki-config.yml
 create mode 100644 monitoring/prometheus/prometheus.yml
 create mode 100644 monitoring/promtail/promtail-config.yml

diff --git a/monitoring/docker-compose.yml b/monitoring/docker-compose.yml
new file mode 100644
index 0000000..e5d129d
--- /dev/null
+++ b/monitoring/docker-compose.yml
@@ -0,0 +1,253 @@
+services:
+  prometheus:
+    image: prom/prometheus:v3.7.3
+    container_name: monitoring-prometheus
+    restart: unless-stopped
+    command:
+      - --config.file=/etc/prometheus/prometheus.yml
+      - --storage.tsdb.path=/prometheus
+      - --storage.tsdb.retention.time=30d
+      - --web.enable-lifecycle
+    volumes:
+      - ./prometheus/prometheus.yml:/etc/prometheus/prometheus.yml:ro
+      - prometheus_data:/prometheus
+    networks:
+      - monitoring_net
+    extra_hosts:
+      # Lets Prometheus scrape host-published Traefik metrics on Unraid.
+      - host.docker.internal:host-gateway
+    expose:
+      - "9090"
+    security_opt:
+      - no-new-privileges:true
+    depends_on:
+      - node-exporter
+      - cadvisor
+
+  loki:
+    image: grafana/loki:3.7.2
+    container_name: monitoring-loki
+    restart: unless-stopped
+    command:
+      - -config.file=/etc/loki/loki-config.yml
+    volumes:
+      - ./loki/loki-config.yml:/etc/loki/loki-config.yml:ro
+      - loki_data:/loki
+    networks:
+      - monitoring_net
+    expose:
+      - "3100"
+    security_opt:
+      - no-new-privileges:true
+
+  promtail:
+    image: grafana/promtail:3.6.10
+    container_name: monitoring-promtail
+    restart: unless-stopped
+    command:
+      - -config.file=/etc/promtail/promtail-config.yml
+    volumes:
+      - ./promtail/promtail-config.yml:/etc/promtail/promtail-config.yml:ro
+      - promtail_positions:/positions
+      - /var/run/docker.sock:/var/run/docker.sock:ro
+      - /var/lib/docker/containers:/var/lib/docker/containers:ro
+    networks:
+      - monitoring_net
+    security_opt:
+      - no-new-privileges:true
+    depends_on:
+      - loki
+
+  grafana:
+    image: grafana/grafana:12.4.3
+    container_name: monitoring-grafana
+    restart: unless-stopped
+    environment:
+      GF_SERVER_ROOT_URL: https://monitoring.kaleschke.info/
+      GF_SECURITY_ADMIN_USER: admin
+      GF_SECURITY_ADMIN_PASSWORD: ${GF_SECURITY_ADMIN_PASSWORD}
+      GF_USERS_ALLOW_SIGN_UP: "false"
+      GF_AUTH_ANONYMOUS_ENABLED: "false"
+    volumes:
+      - grafana_data:/var/lib/grafana
+      - ./grafana/provisioning:/etc/grafana/provisioning:ro
+    networks:
+      - monitoring_net
+      - frontend_net
+    expose:
+      - "3000"
+    security_opt:
+      - no-new-privileges:true
+    depends_on:
+      - prometheus
+      - loki
+    labels:
+      - traefik.enable=true
+      - traefik.docker.network=frontend_net
+      - traefik.http.routers.monitoring-grafana.rule=Host(`monitoring.kaleschke.info`)
+      - traefik.http.routers.monitoring-grafana.entrypoints=websecure
+      - traefik.http.routers.monitoring-grafana.tls=true
+      - traefik.http.routers.monitoring-grafana.tls.certresolver=le
+      - traefik.http.routers.monitoring-grafana.middlewares=authelia@docker,secure-headers@file
+      - traefik.http.services.monitoring-grafana.loadbalancer.server.port=3000
+      # Docker-provider Authelia middleware requested for this stack.
+      - traefik.http.middlewares.authelia.forwardauth.address=http://authelia:9091/api/authz/forward-auth
+      - traefik.http.middlewares.authelia.forwardauth.trustForwardHeader=true
+      - traefik.http.middlewares.authelia.forwardauth.authResponseHeaders=Remote-User,Remote-Groups,Remote-Name,Remote-Email
+
+  # One-shot job: waits for Grafana's health endpoint, then imports the listed
+  # grafana.com dashboards through Grafana's HTTP API as the admin user.
+  grafana-dashboard-importer:
+    image: python:3.13-alpine
+    container_name: monitoring-grafana-dashboard-importer
+    restart: "no"
+    environment:
+      GF_SECURITY_ADMIN_PASSWORD: ${GF_SECURITY_ADMIN_PASSWORD}
+    networks:
+      - monitoring_net
+      - frontend_net
+    security_opt:
+      - no-new-privileges:true
+    depends_on:
+      - grafana
+    command:
+      - /bin/sh
+      - -c
+      - |
+        python - <<'PY'
+        import base64
+        import json
+        import os
+        import time
+        import urllib.error
+        import urllib.request
+
+        grafana_url = "http://grafana:3000"
+        password = os.environ["GF_SECURITY_ADMIN_PASSWORD"]
+        auth = base64.b64encode(f"admin:{password}".encode()).decode()
+        headers = {
+            "Authorization": f"Basic {auth}",
+            "Content-Type": "application/json",
+        }
+
+        def request(path, payload=None, timeout=20):
+            """GET path from Grafana, or POST payload as JSON; return the decoded JSON body."""
+            data = None if payload is None else json.dumps(payload).encode()
+            req = urllib.request.Request(f"{grafana_url}{path}", data=data, headers=headers)
+            if payload is not None:
+                req.method = "POST"
+            with urllib.request.urlopen(req, timeout=timeout) as response:
+                body = response.read()
+            return json.loads(body.decode() or "{}") if body else {}
+
+        # Readiness probe: up to 60 attempts, sleeping 2 s after each failure.
+        for _ in range(60):
+            try:
+                request("/api/health", timeout=5)
+                break
+            except Exception:  # any failure is treated as "not ready yet"
+                time.sleep(2)
+        else:
+            raise SystemExit("Grafana did not become ready in time")
+
+        # (grafana.com dashboard ID, datasource used when an input's plugin is unknown)
+        dashboards = [
+            (1860, "Prometheus"),
+            (14282, "Prometheus"),
+            (17346, "Prometheus"),
+            (13639, "Loki"),
+        ]
+
+        # Names must match grafana/provisioning/datasources/datasources.yml.
+        datasource_by_plugin = {"prometheus": "Prometheus", "loki": "Loki"}
+
+        for dashboard_id, default_datasource in dashboards:
+            with urllib.request.urlopen(
+                f"https://grafana.com/api/dashboards/{dashboard_id}/revisions/latest/download",
+                timeout=30,
+            ) as response:
+                dashboard = json.loads(response.read().decode())
+
+            inputs = []
+            for item in dashboard.get("__inputs", []):
+                input_type = item.get("type", "datasource")
+                if input_type != "datasource":
+                    # Non-datasource inputs (e.g. constants) keep their exported value;
+                    # substituting a datasource name here would corrupt them.
+                    inputs.append({
+                        "name": item.get("name"),
+                        "type": input_type,
+                        "value": item.get("value", ""),
+                    })
+                    continue
+                plugin_id = (item.get("pluginId") or "").lower()
+                inputs.append({
+                    "name": item.get("name"),
+                    "type": input_type,
+                    "pluginId": item.get("pluginId", "prometheus"),
+                    # Pick the datasource by plugin; fall back to the dashboard's default.
+                    "value": datasource_by_plugin.get(plugin_id, default_datasource),
+                })
+
+            request("/api/dashboards/import", {
+                "dashboard": dashboard,
+                "overwrite": True,
+                "inputs": inputs,
+            })
+            print(f"Imported Grafana dashboard {dashboard_id}")
+        PY
+
+  node-exporter:
+    image: prom/node-exporter:v1.9.1
+    container_name: monitoring-node-exporter
+    restart: unless-stopped
+    command:
+      - --path.procfs=/host/proc
+      - --path.sysfs=/host/sys
+      - --path.rootfs=/rootfs
+      # "$$" is Compose's escape for a literal "$" (end-of-string anchor in the regex).
+      - --collector.filesystem.mount-points-exclude=^/(dev|proc|sys|run|var/lib/docker/.+|var/lib/containers/storage/.+)($$|/)
+    volumes:
+      - /proc:/host/proc:ro
+      - /sys:/host/sys:ro
+      - /:/rootfs:ro,rslave
+    networks:
+      - monitoring_net
+    expose:
+      - "9100"
+    security_opt:
+      - no-new-privileges:true
+
+  cadvisor:
+    image: ghcr.io/google/cadvisor:v0.53.0
+    container_name: monitoring-cadvisor
+    restart: unless-stopped
+    command:
+      - --docker_only=true
+      - --housekeeping_interval=30s
+      - --store_container_labels=false
+    volumes:
+      - /:/rootfs:ro
+      - /var/run:/var/run:ro
+      - /sys:/sys:ro
+      - /var/lib/docker:/var/lib/docker:ro
+      - /dev/disk:/dev/disk:ro
+    networks:
+      - monitoring_net
+    expose:
+      - "8080"
+    security_opt:
+      - no-new-privileges:true
+
+networks:
+  monitoring_net:
+    name: monitoring_net
+    driver: bridge
+  frontend_net:
+    external: true
+
+volumes:
+  prometheus_data:
+  loki_data:
+  promtail_positions:
+  grafana_data:
diff --git a/monitoring/grafana/provisioning/dashboards/dashboards.yml b/monitoring/grafana/provisioning/dashboards/dashboards.yml
new file mode 100644
index 0000000..c2365ff
--- /dev/null
+++ b/monitoring/grafana/provisioning/dashboards/dashboards.yml
@@ -0,0 +1,13 @@
+apiVersion: 1
+
+providers:
+  - name: Homelab Imports
+    orgId: 1
+    folder: Homelab
+    type: file
+    disableDeletion: false
+    updateIntervalSeconds: 300
+    allowUiUpdates: true
+    options:
+      # Grafana.com dashboard IDs are imported by the compose one-shot importer.
+      path: /var/lib/grafana/dashboards
diff --git a/monitoring/grafana/provisioning/datasources/datasources.yml b/monitoring/grafana/provisioning/datasources/datasources.yml
new file mode 100644
index 0000000..61cfa51
--- /dev/null
+++ b/monitoring/grafana/provisioning/datasources/datasources.yml
@@ -0,0 +1,19 @@
+apiVersion: 1
+
+datasources:
+  - name: Prometheus
+    type: prometheus
+    access: proxy
+    url: http://prometheus:9090
+    isDefault: true
+    editable: false
+    jsonData:
+      timeInterval: 15s
+
+  - name: Loki
+    type: loki
+    access: proxy
+    url: http://loki:3100
+    editable: false
+    jsonData:
+      maxLines: 1000
diff --git a/monitoring/loki/loki-config.yml b/monitoring/loki/loki-config.yml
new file mode 100644
index 0000000..d8c03bf
--- /dev/null
+++ b/monitoring/loki/loki-config.yml
@@ -0,0 +1,47 @@
+auth_enabled: false
+
+server:
+  http_listen_port: 3100
+  grpc_listen_port: 9096
+
+common:
+  instance_addr: 127.0.0.1
+  path_prefix: /loki
+  storage:
+    filesystem:
+      chunks_directory: /loki/chunks
+      rules_directory: /loki/rules
+  replication_factor: 1
+  ring:
+    kvstore:
+      store: inmemory
+
+query_range:
+  results_cache:
+    cache:
+      embedded_cache:
+        enabled: true
+        max_size_mb: 100
+
+schema_config:
+  configs:
+    - from: 2026-05-16
+      store: tsdb
+      object_store: filesystem
+      schema: v13
+      index:
+        prefix: index_
+        period: 24h
+
+limits_config:
+  retention_period: 720h
+  allow_structured_metadata: true
+  ingestion_rate_mb: 16
+  ingestion_burst_size_mb: 32
+
+compactor:
+  working_directory: /loki/compactor
+  compaction_interval: 10m
+  retention_enabled: true
+  retention_delete_delay: 2h
+  delete_request_store: filesystem
diff --git a/monitoring/prometheus/prometheus.yml b/monitoring/prometheus/prometheus.yml
new file mode 100644
index 0000000..f3f3ed4
--- /dev/null
+++ b/monitoring/prometheus/prometheus.yml
@@ -0,0 +1,28 @@
+global:
+  scrape_interval: 15s
+  evaluation_interval: 15s
+  external_labels:
+    site: kallilabcore
+
+scrape_configs:
+  - job_name: prometheus
+    static_configs:
+      - targets:
+          - prometheus:9090
+
+  - job_name: node-exporter
+    static_configs:
+      - targets:
+          - node-exporter:9100
+
+  - job_name: cadvisor
+    static_configs:
+      - targets:
+          - cadvisor:8080
+
+  - job_name: traefik
+    metrics_path: /metrics
+    static_configs:
+      # Expects Traefik metrics to be reachable on the Unraid host at port 8082.
+      - targets:
+          - host.docker.internal:8082
diff --git a/monitoring/promtail/promtail-config.yml b/monitoring/promtail/promtail-config.yml
new file mode 100644
index 0000000..77d33dd
--- /dev/null
+++ b/monitoring/promtail/promtail-config.yml
@@ -0,0 +1,34 @@
+server:
+  http_listen_port: 9080
+  grpc_listen_port: 0
+
+positions:
+  filename: /positions/positions.yml
+
+clients:
+  - url: http://loki:3100/loki/api/v1/push
+
+scrape_configs:
+  - job_name: docker
+    docker_sd_configs:
+      - host: unix:///var/run/docker.sock
+        refresh_interval: 10s
+    relabel_configs:
+      - source_labels:
+          - __meta_docker_container_name
+        regex: /(.+)
+        target_label: container
+      - source_labels:
+          - __meta_docker_container_log_stream
+        target_label: stream
+      - source_labels:
+          - __meta_docker_container_label_com_docker_compose_project
+        target_label: compose_project
+      - source_labels:
+          - __meta_docker_container_label_com_docker_compose_service
+        target_label: compose_service
+      # Docker json-file logs live under /var/lib/docker/containers/<container-id>/<container-id>-json.log.
+      - source_labels:
+          - __meta_docker_container_id
+        target_label: __path__
+        replacement: /var/lib/docker/containers/$1/$1-json.log