Add Loki Alloy logging baseline

This commit is contained in:
2026-05-16 13:26:49 +02:00
parent 5ada1ad153
commit a5add937f8
17 changed files with 330 additions and 5 deletions
+7 -2
View File
@@ -9,7 +9,7 @@ Monitoring-Stack fuer Grafana + InfluxDB 3 Core. InfluxDB bleibt ohne Public Rou
- Grafana wird ueber Traefik + `authelia@file,secure-headers@file` unter `grafana.kaleschke.info` veroeffentlicht.
- InfluxDB bleibt ohne Traefik-Route. Der HTTP-Port `8181` kann fuer interne Writer wie Home Assistant ueber `INFLUXDB_BIND_IP` auf eine LAN-Adresse gebunden werden; Default ist `127.0.0.1`.
- InfluxDB haengt an zwei Compose-Netzen: `grafana_influx_internal` fuer Grafana und `grafana_influx_lan` fuer das Docker Host-Port-Publishing. Im laufenden Komodo-Stack heissen sie durch den Compose-Projektpraefix `grafana_grafana_influx_internal` und `grafana_grafana_influx_lan`. InfluxDB haengt bewusst nicht im `frontend_net`.
- Grafana provisioning legt eine SQL-Datenquelle fuer InfluxDB 3 Core mit der Datenbank `homelab` an.
- Grafana provisioning legt eine SQL-Datenquelle fuer InfluxDB 3 Core mit der Datenbank `homelab` und eine Loki-Datasource fuer Container-Logs an.
- Der Grafana-Datasource-Token liegt als Secret-Datei auf dem Host und wird beim Containerstart nur containerintern in die fuer Grafana-Provisioning noetige Environment-Variable geladen.
- Home Assistant schreibt mit der InfluxDB-v2-API-Kompatibilitaet nach InfluxDB 3; Details: `docs/HOME_ASSISTANT_INFLUXDB_ECOWITT.md`.
@@ -39,12 +39,15 @@ Monitoring-Stack fuer Grafana + InfluxDB 3 Core. InfluxDB bleibt ohne Public Rou
install -m 600 /dev/null /mnt/user/appdata/secrets/grafana_influxdb_token.txt
```
4. Provisioning-Datei aus dem Git-Checkout auf den Host-Appdata-Pfad kopieren:
4. Provisioning-Dateien aus dem Git-Checkout auf den Host-Appdata-Pfad kopieren:
```bash
mkdir -p /mnt/user/appdata/grafana/provisioning/datasources
mkdir -p /mnt/user/appdata/grafana/provisioning/dashboards
cp /mnt/user/appdata/komodo/core/repos/homelab-infra/ops/grafana-influxdb/provisioning/datasources/influxdb.yml /mnt/user/appdata/grafana/provisioning/datasources/influxdb.yml
cp /mnt/user/appdata/komodo/core/repos/homelab-infra/ops/grafana-influxdb/provisioning/dashboards/* /mnt/user/appdata/grafana/provisioning/dashboards/
chmod 644 /mnt/user/appdata/grafana/provisioning/datasources/influxdb.yml
chmod 644 /mnt/user/appdata/grafana/provisioning/dashboards/*
```
5. Nach dem ersten Start die Datenbank anlegen:
@@ -57,6 +60,8 @@ Monitoring-Stack fuer Grafana + InfluxDB 3 Core. InfluxDB bleibt ohne Public Rou
- `https://grafana.kaleschke.info` oeffnet nach Authelia die Grafana-Loginseite.
- Grafana `Connections -> Data sources -> InfluxDB 3 Core -> Save & test` ist erfolgreich.
- Grafana `Connections -> Data sources -> Loki -> Save & test` ist erfolgreich, sobald der Loki/Alloy-Stack laeuft.
- Die provisionierten Dashboards `Last 60 min before now`, `Restart Events` und `Container Error Rate` sind sichtbar.
- InfluxDB bleibt ohne Public Route. Falls `INFLUXDB_BIND_IP` auf die LAN-IP gesetzt ist, ist Port `8181` nur im internen Netz fuer Writer wie Home Assistant erreichbar.
- `docker ps` zeigt fuer `influxdb3-core` `192.168.178.58:8181->8181/tcp` oder den per `INFLUXDB_BIND_IP` gesetzten Host.
- `ss -ltnp | grep 8181` zeigt einen Listener auf der gebundenen Host-IP.
+3
View File
@@ -26,6 +26,7 @@ services:
- grafana_influxdb_token
networks:
- frontend_net
- backend_net
- grafana_influx_internal
security_opt:
- no-new-privileges:true
@@ -82,6 +83,8 @@ secrets:
networks:
frontend_net:
external: true
backend_net:
external: true
grafana_influx_lan:
driver: bridge
grafana_influx_internal:
@@ -0,0 +1,23 @@
{
  "uid": "kallilab-container-error-rate",
  "title": "Container Error Rate",
  "schemaVersion": 39,
  "version": 1,
  "refresh": "5m",
  "time": {
    "from": "now-24h",
    "to": "now"
  },
  "panels": [
    {
      "id": 1,
      "title": "Container Errors Last 24h",
      "type": "table",
      "datasource": {
        "type": "loki",
        "uid": "loki"
      },
      "gridPos": {
        "x": 0,
        "y": 0,
        "w": 24,
        "h": 16
      },
      "targets": [
        {
          "refId": "A",
          "expr": "sum by (container_name) (count_over_time({platform=\"docker\"} |~ \"(?i)(level=error|error|fatal|panic)\" [24h]))"
        }
      ]
    }
  ]
}
@@ -0,0 +1,43 @@
{
  "uid": "kallilab-logs-last-60m",
  "title": "Last 60 min before now",
  "schemaVersion": 39,
  "version": 1,
  "refresh": "30s",
  "time": {
    "from": "now-60m",
    "to": "now"
  },
  "templating": {
    "list": [
      {
        "type": "query",
        "name": "container",
        "datasource": {
          "type": "loki",
          "uid": "loki"
        },
        "query": "label_values(container_name)",
        "refresh": 1,
        "includeAll": true,
        "allValue": ".+"
      }
    ]
  },
  "panels": [
    {
      "id": 1,
      "title": "Docker Log Stream",
      "type": "logs",
      "datasource": {
        "type": "loki",
        "uid": "loki"
      },
      "gridPos": {
        "x": 0,
        "y": 0,
        "w": 24,
        "h": 20
      },
      "targets": [
        {
          "refId": "A",
          "expr": "{platform=\"docker\", container_name=~\"$container\"}"
        }
      ],
      "options": {
        "enableLogDetails": true,
        "showLabels": true,
        "showTime": true,
        "sortOrder": "Descending",
        "wrapLogMessage": false
      }
    }
  ]
}
@@ -0,0 +1,12 @@
# Grafana dashboard provisioning: one file-based provider that loads the
# checked-in dashboards into the "KalliLab Observability" folder.
apiVersion: 1

providers:
  - name: KalliLab Observability
    type: file
    orgId: 1
    folder: KalliLab Observability
    options:
      # Directory inside the Grafana container that holds the dashboard files.
      path: /etc/grafana/provisioning/dashboards
    # Re-scan the directory for changed files once per minute.
    updateIntervalSeconds: 60
    # The files on disk stay the source of truth.
    allowUiUpdates: false
    disableDeletion: false
@@ -0,0 +1,23 @@
{
  "uid": "kallilab-restart-events",
  "title": "Restart Events",
  "schemaVersion": 39,
  "version": 1,
  "refresh": "5m",
  "time": {
    "from": "now-24h",
    "to": "now"
  },
  "panels": [
    {
      "id": 1,
      "title": "Restart-like Log Events",
      "type": "heatmap",
      "datasource": {
        "type": "loki",
        "uid": "loki"
      },
      "gridPos": {
        "x": 0,
        "y": 0,
        "w": 24,
        "h": 16
      },
      "targets": [
        {
          "refId": "A",
          "expr": "sum by (container_name) (count_over_time({platform=\"docker\"} |~ \"(?i)(restart|restarting|started|exited|oom)\" [5m]))"
        }
      ]
    }
  ]
}
@@ -16,3 +16,11 @@ datasources:
insecureGrpc: true
secureJsonData:
token: $GRAFANA_INFLUXDB_TOKEN
# Loki data source for container logs. The fixed uid `loki` is what the
# provisioned dashboards reference in their `datasource` fields.
- name: Loki
  type: loki
  uid: loki
  # Queries go through the Grafana backend to Loki's HTTP port.
  access: proxy
  url: http://loki:3100
  isDefault: false
  jsonData:
    maxLines: 1000
+23
View File
@@ -0,0 +1,23 @@
# Loki / Alloy
Internal logging stack for KalliLab CORE.
## Services
- `loki`: internal log store on `backend_net`, no Traefik route, `auth_enabled: false` because access is limited to internal Docker networking.
- `alloy`: Docker log collector. It mounts `/var/run/docker.sock:ro` as a documented observability exception and forwards Docker container logs to Loki.
## Host sync
Before first deploy, sync the checked-in config files to appdata:
```bash
mkdir -p /mnt/user/appdata/loki/config /mnt/user/appdata/loki/data
mkdir -p /mnt/user/appdata/alloy/config /mnt/user/appdata/alloy/data
cp /mnt/user/services/homelab-infra/ops/loki/config/loki-config.yml /mnt/user/appdata/loki/config/loki-config.yml
cp /mnt/user/services/homelab-infra/ops/loki/config/config.alloy /mnt/user/appdata/alloy/config/config.alloy
```
## Restore posture
Loki data is transient operational telemetry. Docker raw logs remain the first fallback, Loki chunks on disk are a convenience cache, and ntfy critical events provide the external first-crash marker.
+43
View File
@@ -0,0 +1,43 @@
// Discover containers through the Docker socket; the resulting targets feed
// loki.source.docker.containers.
discovery.docker "containers" {
  host = "unix:///var/run/docker.sock"
}
// Relabel rule set shared with loki.source.docker.containers. Only the
// exported `rules` are consumed, so `targets` is deliberately left empty.
discovery.relabel "docker_logs" {
  targets = []

  // Docker reports names as "/name"; keep the part after the leading slash.
  rule {
    source_labels = ["__meta_docker_container_name"]
    regex         = "/(.*)"
    target_label  = "container_name"
  }

  // Copy the Compose project label of the container.
  rule {
    source_labels = ["__meta_docker_container_label_com_docker_compose_project"]
    target_label  = "compose_project"
  }

  // Copy the Compose service label of the container.
  rule {
    source_labels = ["__meta_docker_container_label_com_docker_compose_service"]
    target_label  = "compose_service"
  }
}
// Read the logs of every discovered container via the Docker socket, attach
// the static labels plus the relabel rules, and hand the entries to the
// processing stage.
loki.source.docker "containers" {
  host          = "unix:///var/run/docker.sock"
  targets       = discovery.docker.containers.targets
  relabel_rules = discovery.relabel.docker_logs.rules
  labels        = {
    platform = "docker",
    host     = "kallilabcore",
  }
  forward_to    = [loki.process.docker.receiver]
}
// Pass-through hop between the Docker log source and the Loki writer; kept so
// that processing stages can be added later without rewiring the pipeline.
//
// No `stage.docker` here on purpose: that stage parses the json-file wrapper
// ({"log": ..., "stream": ..., "time": ...}) found in Docker log files on disk.
// loki.source.docker reads through the Docker API and already delivers the
// unwrapped line, so the stage would never match regular output and could
// rewrite the line or timestamp of application logs that are themselves JSON
// with `log`/`time` keys.
loki.process "docker" {
  forward_to = [loki.write.local.receiver]
}
// Ship processed entries to the Loki push API of the `loki` service.
loki.write "local" {
  endpoint {
    url = "http://loki:3100/loki/api/v1/push"
  }
}
+45
View File
@@ -0,0 +1,45 @@
# No tenant authentication; the stack README justifies this with access being
# limited to internal Docker networking.
auth_enabled: false

# Listener ports. 3100 is the HTTP port used by the Grafana data source, the
# Alloy push URL and the container healthcheck.
server:
  grpc_listen_port: 9096
  http_listen_port: 3100
# Shared settings for a single-instance Loki: in-memory ring, no replication,
# everything stored on the local filesystem below /loki.
common:
  path_prefix: /loki
  instance_addr: 127.0.0.1
  replication_factor: 1
  ring:
    kvstore:
      store: inmemory
  storage:
    filesystem:
      chunks_directory: /loki/chunks
      rules_directory: /loki/rules
# Cache range-query results in process memory, capped at 100 MB.
query_range:
  results_cache:
    cache:
      embedded_cache:
        max_size_mb: 100
        enabled: true
# Single schema period: TSDB index (schema v13) with daily index tables on the
# local filesystem.
# NOTE(review): `from` is the day this config was introduced. Presumably no
# entries with older timestamps are expected on first ingest; confirm, since
# they would fall outside any configured schema period.
schema_config:
  configs:
    - from: 2026-05-16
      schema: v13
      store: tsdb
      object_store: filesystem
      index:
        period: 24h
        prefix: index_
# Keep logs for 720h (30 days) and accept structured metadata on entries.
limits_config:
  allow_structured_metadata: true
  retention_period: 720h
# Compactor with retention enabled; it runs every 10 minutes and works out of
# /loki/compactor.
compactor:
  working_directory: /loki/compactor
  delete_request_store: filesystem
  retention_enabled: true
  retention_delete_delay: 2h
  compaction_interval: 10m
+43
View File
@@ -0,0 +1,43 @@
# Logging stack: Loki stores the logs, Alloy collects them from Docker.
# Neither service publishes host ports; both only join backend_net.
services:
  # Log store. Config is mounted read-only, data lives under /loki.
  loki:
    container_name: loki
    image: grafana/loki:3.7.2@sha256:191d4fdfb7264f16989f0a57f320872620a5a7c2ceeec6229212c4190ec49b86
    restart: unless-stopped
    command: ["-config.file=/etc/loki/loki-config.yml"]
    networks:
      - backend_net
    volumes:
      - /mnt/user/appdata/loki/config:/etc/loki:ro
      - /mnt/user/appdata/loki/data:/loki
    security_opt:
      - no-new-privileges:true
    healthcheck:
      # Probes Loki's /ready endpoint on the HTTP port.
      test: ["CMD", "wget", "--spider", "-q", "http://localhost:3100/ready"]
      start_period: 40s
      interval: 30s
      timeout: 10s
      retries: 5

  # Docker log collector. The Docker socket is mounted read-only because
  # config.alloy uses it for container discovery and log reading.
  alloy:
    container_name: alloy
    image: grafana/alloy:v1.16.1@sha256:51aeb9d829239345070619dad3edd6873186f913c84f45b365b74574fcb38ec0
    restart: unless-stopped
    command:
      - run
      - /etc/alloy/config.alloy
      - --storage.path=/var/lib/alloy/data
    depends_on:
      - loki
    networks:
      - backend_net
    volumes:
      - /mnt/user/appdata/alloy/config:/etc/alloy:ro
      - /mnt/user/appdata/alloy/data:/var/lib/alloy/data
      - /var/run/docker.sock:/var/run/docker.sock:ro
    security_opt:
      - no-new-privileges:true
# backend_net is declared external: Compose attaches to the existing network
# and does not create it.
networks:
  backend_net:
    external: true