From 94270c1f227dfda37d1394dae54b7138049ec094 Mon Sep 17 00:00:00 2001 From: Ronni Baslund Date: Wed, 10 Jun 2026 19:51:25 +0200 Subject: [PATCH] fix(health): env-driven infrastructure probe targets MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The operator infrastructure page probed docker-compose hostnames (stalwart/postgres/redis/traefik…) which don't resolve in k3s — 7 of 9 services showed down. Probe targets now come from HEALTH_* env vars with the compose names as dev defaults; platform-api-config.yaml sets the in-cluster/host addresses. 'disabled' omits a service from the report — used for OCIS/Collabora until the files tier is deployed. --- .../fleet/apps/platform-api-config.yaml | 10 +++++ .../platform-api/src/health/health.service.ts | 39 +++++++++++++------ 2 files changed, 38 insertions(+), 11 deletions(-) diff --git a/infrastructure/production/fleet/apps/platform-api-config.yaml b/infrastructure/production/fleet/apps/platform-api-config.yaml index 1abd84f..ef7a954 100644 --- a/infrastructure/production/fleet/apps/platform-api-config.yaml +++ b/infrastructure/production/fleet/apps/platform-api-config.yaml @@ -39,3 +39,13 @@ data: BILLING_STRIPE_ENABLED: "false" BOOKING_PUBLIC_URL: "https://booking.dezky.eu" MEET_PUBLIC_URL: "https://meet.dezky.eu" + # Infrastructure health-probe targets (operator → /health/platform). The + # code defaults are docker-compose hostnames; these are the k3s addresses. + # "disabled" omits a service from the report until that tier is deployed. + HEALTH_STALWART_HOSTPORT: "10.42.0.1:8080" + HEALTH_AUTHENTIK_URL: "http://authentik-server.dezky-auth.svc.cluster.local/-/health/ready/" + HEALTH_POSTGRES_HOSTPORT: "postgres.dezky-data.svc.cluster.local:5432" + HEALTH_REDIS_HOSTPORT: "redis.dezky-data.svc.cluster.local:6379" + HEALTH_TRAEFIK_HOSTPORT: "traefik.kube-system.svc.cluster.local:80" + HEALTH_OCIS_URL: "disabled" + HEALTH_COLLABORA_URL: "disabled" diff --git a/services/platform-api/src/health/health.service.ts b/services/platform-api/src/health/health.service.ts index d288f3c..95e2c7a 100644 --- a/services/platform-api/src/health/health.service.ts +++ b/services/platform-api/src/health/health.service.ts @@ -1,7 +1,9 @@ // Live health probes for the services we expect to find in the Dezky stack. -// Hostnames + ports are the compose service names from -// infrastructure/docker-compose/docker-compose.yml. When we move to k3s, -// swap these for in-cluster service DNS (e.g. authentik.dezky.svc...). +// Targets are env-driven (HEALTH_*): the defaults are the docker-compose +// service names so dev needs no env, and production sets in-cluster DNS / +// host addresses in platform-api-config.yaml. Setting a probe's env to +// "disabled" omits that service from the report entirely — used for tiers +// that aren't deployed yet (OCIS/Collabora in prod). import { Injectable } from '@nestjs/common' import { InjectConnection } from '@nestjs/mongoose' @@ -40,17 +42,17 @@ export class HealthService { async probeAll(): Promise { const probes: ProbeSpec[] = [ - { id: 'mail', name: 'Stalwart', role: 'Mail · IMAP/JMAP/SMTP', run: () => tcpProbe('stalwart', 8080, PROBE_TIMEOUT_MS) }, - { id: 'files', name: 'OCIS', role: 'Files · OwnCloud Infinite',run: () => httpProbe('http://ocis:9200/health', PROBE_TIMEOUT_MS) }, - { id: 'office', name: 'Collabora', role: 'Office editing · WOPI', run: () => httpProbe('http://collabora:9980/hosting/discovery', PROBE_TIMEOUT_MS) }, - { id: 'auth', name: 'Authentik', role: 'Identity · SSO · MFA', run: () => httpProbe('http://authentik-server:9000/-/health/ready/', PROBE_TIMEOUT_MS) }, - { id: 'pg', name: 'PostgreSQL', role: 'Authentik + OCIS database',run: () => tcpProbe('postgres', 5432, PROBE_TIMEOUT_MS) }, + tcpSpec('mail', 'Stalwart', 'Mail · IMAP/JMAP/SMTP', 'HEALTH_STALWART_HOSTPORT', 'stalwart:8080'), + httpSpec('files', 'OCIS', 'Files · OwnCloud Infinite', 'HEALTH_OCIS_URL', 'http://ocis:9200/health'), + httpSpec('office','Collabora', 'Office editing · WOPI', 'HEALTH_COLLABORA_URL', 'http://collabora:9980/hosting/discovery'), + httpSpec('auth', 'Authentik', 'Identity · SSO · MFA', 'HEALTH_AUTHENTIK_URL', 'http://authentik-server:9000/-/health/ready/'), + tcpSpec('pg', 'PostgreSQL', 'Authentik + OCIS database', 'HEALTH_POSTGRES_HOSTPORT', 'postgres:5432'), { id: 'mongo', name: 'MongoDB', role: 'Platform application data',run: () => this.mongoPing(PROBE_TIMEOUT_MS) }, - { id: 'redis', name: 'Redis', role: 'Cache + session store', run: () => tcpProbe('redis', 6379, PROBE_TIMEOUT_MS) }, - { id: 'proxy', name: 'Traefik', role: 'Reverse proxy · TLS', run: () => tcpProbe('traefik', 80, PROBE_TIMEOUT_MS) }, + tcpSpec('redis', 'Redis', 'Cache + session store', 'HEALTH_REDIS_HOSTPORT', 'redis:6379'), + tcpSpec('proxy', 'Traefik', 'Reverse proxy · TLS', 'HEALTH_TRAEFIK_HOSTPORT', 'traefik:80'), // platform-api itself: this code is running, so it's trivially ok. { id: 'api', name: 'Platform API', role: 'Control plane', run: async () => { /* always ok */ } }, - ] + ].filter((p): p is ProbeSpec => p !== null) const checkedAt = new Date().toISOString() const results = await Promise.all(probes.map((p) => run(p, checkedAt))) @@ -100,6 +102,21 @@ async function run(spec: ProbeSpec, checkedAt: string): Promise { } } +// ── Probe spec builders ──────────────────────────────────────────────────── + +function tcpSpec(id: string, name: string, role: string, envKey: string, dflt: string): ProbeSpec | null { + const target = process.env[envKey] || dflt + if (target === 'disabled') return null + const [host, port] = target.split(':') + return { id, name, role, run: () => tcpProbe(host!, Number(port), PROBE_TIMEOUT_MS) } +} + +function httpSpec(id: string, name: string, role: string, envKey: string, dflt: string): ProbeSpec | null { + const url = process.env[envKey] || dflt + if (url === 'disabled') return null + return { id, name, role, run: () => httpProbe(url, PROBE_TIMEOUT_MS) } +} + // ── Primitives ───────────────────────────────────────────────────────────── function tcpProbe(host: string, port: number, timeoutMs: number): Promise {