From 77a09aaf777e779bd3b0d5169960f9b0d973aa6b Mon Sep 17 00:00:00 2001 From: Ronni Baslund Date: Sun, 24 May 2026 18:47:38 +0200 Subject: [PATCH] feat(operator): live Infrastructure probes + honest split between deployed and planned MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The Infrastructure page used to read from a mock fixture that lied two ways: it listed services that aren't deployed (Jitsi, Zulip, Cloudflare, Object Storage, Postmark) and showed hardcoded uptime/latency for the ones that are. Now it shows truth from real probes plus a clearly-labelled "planned" section for the rest. Backend (services/platform-api): - New src/health/ module — HealthService runs 9 probes in parallel with a 1.5s timeout each: Stalwart → TCP stalwart:8080 OCIS → HTTP GET ocis:9200/health Collabora → HTTP GET collabora:9980/hosting/discovery Authentik → HTTP GET authentik-server:9000/-/health/ready/ Postgres → TCP postgres:5432 Mongo → existing Mongoose connection.db.admin().ping() Redis → TCP redis:6379 Traefik → TCP traefik:80 Platform API → trivially ok (this code is running) Status thresholds: ok ≤500ms, warn 500–1500ms, bad on any probe failure (timeout, refused or unresolvable host, non-2xx HTTP response). - HealthController exposes GET /health/platform behind JwtAuthGuard, plus keeps the existing public GET /health for infra liveness checks. - Moved the old src/health.controller.ts into the new module. Frontend (apps/operator): - /api/health/platform proxy forwards the operator's access token. - Infrastructure page swaps SERVICES fixture for useFetch with 30s auto-refresh + a manual Refresh button. Cards show real status badge + real latency; uptime/error stay as em-dash with a "no probe history yet" tooltip until a Prometheus/event-log backend lands. - Below the live grid, a "Planned · not deployed" section renders 5 dimmed cards (Jitsi, Zulip, simpledns.plus, Hetzner Object Storage, Postmark). simpledns.plus replaces the misnamed Cloudflare entry — we use simpledns.plus, not Cloudflare. 
- Subtitle is now truthful: "8 / 9 services live · checked 2s ago". Verified: stopped redis → card flipped to "down · getaddrinfo ENOTFOUND redis", subtitle reflected 8/9, incident banner appeared. Restarted → back to 9/9, banner gone. SERVICES fixture stays in place for Overview's incident banner — replacing that is a separate follow-up tied to the incident-management backend. --- apps/operator/data/fixtures.ts | 18 +++ apps/operator/pages/infrastructure.vue | 131 +++++++++++---- .../server/api/health/platform.get.ts | 3 + services/platform-api/src/app.module.ts | 4 +- .../platform-api/src/health.controller.ts | 13 -- .../src/health/health.controller.ts | 28 ++++ .../platform-api/src/health/health.module.ts | 11 ++ .../platform-api/src/health/health.service.ts | 151 ++++++++++++++++++ 8 files changed, 316 insertions(+), 43 deletions(-) create mode 100644 apps/operator/server/api/health/platform.get.ts delete mode 100644 services/platform-api/src/health.controller.ts create mode 100644 services/platform-api/src/health/health.controller.ts create mode 100644 services/platform-api/src/health/health.module.ts create mode 100644 services/platform-api/src/health/health.service.ts diff --git a/apps/operator/data/fixtures.ts b/apps/operator/data/fixtures.ts index 96c77ec..18baace 100644 --- a/apps/operator/data/fixtures.ts +++ b/apps/operator/data/fixtures.ts @@ -108,6 +108,24 @@ export const OP_AUDIT: AuditEntry[] = [ { id: 'op_8811', when: '09:30:00', actor: 'Anne Baslund', role: 'platform admin', action: 'tos.published', target: 'v2026.05 · all tenants', tenant: '—', ip: '10.0.4.18', tone: 'info' }, ] +// Services in the design that haven't been deployed yet. Surfaced as a +// separate "Planned" section on the Infrastructure page so the operator sees +// honest deployment state instead of a fake all-green grid. 
+export interface PlannedService { + id: string + name: string + role: string + note: string +} + +export const PLANNED_SERVICES: PlannedService[] = [ + { id: 'jitsi', name: 'Jitsi', role: 'Video meetings', note: 'Lands with docker-compose.optional.yml (Phase 7)' }, + { id: 'zulip', name: 'Zulip', role: 'Team chat', note: 'Lands with docker-compose.optional.yml (Phase 7)' }, + { id: 'dns', name: 'simpledns.plus', role: 'DNS · authoritative', note: 'External SaaS · prod only' }, + { id: 'objstore', name: 'Hetzner Object Storage', role: 'Files · S3 backend for OCIS', note: 'External · prod only' }, + { id: 'smtp-out', name: 'Postmark', role: 'Outbound SMTP · transactional email', note: 'External SaaS · prod only' }, +] + export type NotificationKind = 'security' | 'user' | 'billing' | 'integration' | 'support' | 'signin' export type NotificationTone = 'warn' | 'info' | 'neutral' | 'ok' | 'bad' export interface NotificationItem { diff --git a/apps/operator/pages/infrastructure.vue b/apps/operator/pages/infrastructure.vue index 62af2df..ee49e18 100644 --- a/apps/operator/pages/infrastructure.vue +++ b/apps/operator/pages/infrastructure.vue @@ -1,15 +1,55 @@ @@ -18,17 +58,13 @@ function label(s: PlatformService) { @@ -40,33 +76,63 @@ function label(s: PlatformService) {
{{ INCIDENT.title }}
-
Started {{ INCIDENT.started }} · IC: {{ INCIDENT.ic }}
+
{{ degradedCount }} service(s) reporting non-ok status · IC: {{ INCIDENT.ic }}
- Open incident + Open incident + Live · {{ totalCount }} services +
- + +
+
+
{{ p.name }}
+ {{ p.role }} +
+ {{ label(p) }} +
+
+ + + +
+
+ probed {{ checkedAgo }} + + {{ p.status === 'bad' ? 'down' : 'slow' }} · {{ p.error.slice(0, 32) }} + + {{ p.status === 'ok' ? 'ok' : 'check details' }} +
+
+
+ + Planned · {{ PLANNED_SERVICES.length }} services · not deployed + +
+
{{ s.name }}
{{ s.role }}
- {{ label(s) }} + not deployed
-
- - - -
-
- last incident · {{ s.last }} - details → +
+ {{ s.note }}
- // mock fixtures — wire up to Docker healthchecks + Prometheus in a follow-up + + // probes live in services/platform-api/src/health/. uptime / error rate stay + em-dashed until a probe history (Prometheus, persisted event log) lands — + see "Real observability" in NEXT-STEPS.md follow-ups +
@@ -74,6 +140,8 @@ function label(s: PlatformService) { diff --git a/apps/operator/server/api/health/platform.get.ts b/apps/operator/server/api/health/platform.get.ts new file mode 100644 index 0000000..9691e1b --- /dev/null +++ b/apps/operator/server/api/health/platform.get.ts @@ -0,0 +1,3 @@ +import { platformApi } from '~~/server/utils/platform-api' + +export default defineEventHandler((event) => platformApi(event, '/health/platform')) diff --git a/services/platform-api/src/app.module.ts b/services/platform-api/src/app.module.ts index 63170b0..e159b8e 100644 --- a/services/platform-api/src/app.module.ts +++ b/services/platform-api/src/app.module.ts @@ -2,7 +2,7 @@ import { Module } from '@nestjs/common' import { ConfigModule } from '@nestjs/config' import { MongooseModule } from '@nestjs/mongoose' import { AuthModule } from './auth/auth.module.js' -import { HealthController } from './health.controller.js' +import { HealthModule } from './health/health.module.js' import { PartnersModule } from './partners/partners.module.js' import { SeedModule } from './seed/seed.module.js' import { SubscriptionsModule } from './subscriptions/subscriptions.module.js' @@ -16,12 +16,12 @@ import { UsersModule } from './users/users.module.js' process.env.MONGODB_URI ?? 
'mongodb://localhost:27017/dezky', ), AuthModule, + HealthModule, TenantsModule, PartnersModule, UsersModule, SubscriptionsModule, SeedModule, ], - controllers: [HealthController], }) export class AppModule {} diff --git a/services/platform-api/src/health.controller.ts b/services/platform-api/src/health.controller.ts deleted file mode 100644 index c97d951..0000000 --- a/services/platform-api/src/health.controller.ts +++ /dev/null @@ -1,13 +0,0 @@ -import { Controller, Get } from '@nestjs/common' - -@Controller('health') -export class HealthController { - @Get() - check() { - return { - status: 'ok', - service: 'dezky-platform-api', - timestamp: new Date().toISOString(), - } - } -} diff --git a/services/platform-api/src/health/health.controller.ts b/services/platform-api/src/health/health.controller.ts new file mode 100644 index 0000000..cd98bb9 --- /dev/null +++ b/services/platform-api/src/health/health.controller.ts @@ -0,0 +1,28 @@ +import { Controller, Get, UseGuards } from '@nestjs/common' +import { JwtAuthGuard } from '../auth/jwt-auth.guard.js' +import { HealthService } from './health.service.js' + +@Controller('health') +export class HealthController { + constructor(private readonly health: HealthService) {} + + // Public liveness probe — used by infra (Docker / k8s) to know the + // platform-api process is alive. Intentionally not behind auth. + @Get() + check() { + return { + status: 'ok', + service: 'dezky-platform-api', + timestamp: new Date().toISOString(), + } + } + + // Aggregated probes of every neighbouring service in the stack. Behind + // JwtAuthGuard because we don't want this read by random unauthenticated + // clients — it leaks the topology of the deployment. 
+ @Get('platform') + @UseGuards(JwtAuthGuard) + async platform() { + return this.health.probeAll() + } +} diff --git a/services/platform-api/src/health/health.module.ts b/services/platform-api/src/health/health.module.ts new file mode 100644 index 0000000..2e1c19c --- /dev/null +++ b/services/platform-api/src/health/health.module.ts @@ -0,0 +1,11 @@ +import { Module } from '@nestjs/common' +import { AuthModule } from '../auth/auth.module.js' +import { HealthController } from './health.controller.js' +import { HealthService } from './health.service.js' + +@Module({ + imports: [AuthModule], + controllers: [HealthController], + providers: [HealthService], +}) +export class HealthModule {} diff --git a/services/platform-api/src/health/health.service.ts b/services/platform-api/src/health/health.service.ts new file mode 100644 index 0000000..d288f3c --- /dev/null +++ b/services/platform-api/src/health/health.service.ts @@ -0,0 +1,151 @@ +// Live health probes for the services we expect to find in the Dezky stack. +// Hostnames + ports are the compose service names from +// infrastructure/docker-compose/docker-compose.yml. When we move to k3s, +// swap these for in-cluster service DNS (e.g. authentik.dezky.svc...). + +import { Injectable } from '@nestjs/common' +import { InjectConnection } from '@nestjs/mongoose' +import type { Connection } from 'mongoose' +import * as net from 'node:net' + +export type ProbeStatus = 'ok' | 'warn' | 'bad' + +export interface ProbeResult { + id: string + name: string + role: string + status: ProbeStatus + latencyMs: number | null + error?: string + checkedAt: string +} + +interface ProbeSpec { + id: string + name: string + role: string + // The probe returns either nothing (success) or throws / returns a reason + // (failure). Latency is measured around the call by the runner. + run(timeoutMs: number): Promise +} + +// Generous-ish per-probe budget. 
Probes run in parallel, so the total +// /health/platform response should be ~timeout regardless of count. +const PROBE_TIMEOUT_MS = 1500 +const WARN_THRESHOLD_MS = 500 + +@Injectable() +export class HealthService { + constructor(@InjectConnection() private readonly mongo: Connection) {} + + async probeAll(): Promise { + const probes: ProbeSpec[] = [ + { id: 'mail', name: 'Stalwart', role: 'Mail · IMAP/JMAP/SMTP', run: () => tcpProbe('stalwart', 8080, PROBE_TIMEOUT_MS) }, + { id: 'files', name: 'OCIS', role: 'Files · OwnCloud Infinite',run: () => httpProbe('http://ocis:9200/health', PROBE_TIMEOUT_MS) }, + { id: 'office', name: 'Collabora', role: 'Office editing · WOPI', run: () => httpProbe('http://collabora:9980/hosting/discovery', PROBE_TIMEOUT_MS) }, + { id: 'auth', name: 'Authentik', role: 'Identity · SSO · MFA', run: () => httpProbe('http://authentik-server:9000/-/health/ready/', PROBE_TIMEOUT_MS) }, + { id: 'pg', name: 'PostgreSQL', role: 'Authentik + OCIS database',run: () => tcpProbe('postgres', 5432, PROBE_TIMEOUT_MS) }, + { id: 'mongo', name: 'MongoDB', role: 'Platform application data',run: () => this.mongoPing(PROBE_TIMEOUT_MS) }, + { id: 'redis', name: 'Redis', role: 'Cache + session store', run: () => tcpProbe('redis', 6379, PROBE_TIMEOUT_MS) }, + { id: 'proxy', name: 'Traefik', role: 'Reverse proxy · TLS', run: () => tcpProbe('traefik', 80, PROBE_TIMEOUT_MS) }, + // platform-api itself: this code is running, so it's trivially ok. + { id: 'api', name: 'Platform API', role: 'Control plane', run: async () => { /* always ok */ } }, + ] + + const checkedAt = new Date().toISOString() + const results = await Promise.all(probes.map((p) => run(p, checkedAt))) + return results + } + + private async mongoPing(timeoutMs: number): Promise { + if (this.mongo.readyState !== 1) { + throw new Error(`mongoose readyState=${this.mongo.readyState}`) + } + await withTimeout( + // db is defined once the connection is open; the readyState guard above + // ensures that. 
+ this.mongo.db!.admin().ping(), + timeoutMs, + 'mongo ping timed out', + ) + } +} + +// ── Runner ───────────────────────────────────────────────────────────────── + +async function run(spec: ProbeSpec, checkedAt: string): Promise { + const start = Date.now() + try { + await spec.run(PROBE_TIMEOUT_MS) + const latencyMs = Date.now() - start + return { + id: spec.id, + name: spec.name, + role: spec.role, + status: latencyMs > WARN_THRESHOLD_MS ? 'warn' : 'ok', + latencyMs, + checkedAt, + } + } catch (err) { + const latencyMs = Date.now() - start + return { + id: spec.id, + name: spec.name, + role: spec.role, + status: 'bad', + latencyMs: latencyMs < PROBE_TIMEOUT_MS ? latencyMs : null, + error: err instanceof Error ? err.message : String(err), + checkedAt, + } + } +} + +// ── Primitives ───────────────────────────────────────────────────────────── + +function tcpProbe(host: string, port: number, timeoutMs: number): Promise { + return new Promise((resolve, reject) => { + const socket = net.createConnection({ host, port }) + const timer = setTimeout(() => { + socket.destroy() + reject(new Error(`tcp ${host}:${port} timed out after ${timeoutMs}ms`)) + }, timeoutMs) + socket.once('connect', () => { + clearTimeout(timer) + socket.end() + resolve() + }) + socket.once('error', (err) => { + clearTimeout(timer) + reject(err) + }) + }) +} + +async function httpProbe(url: string, timeoutMs: number): Promise { + const controller = new AbortController() + const timer = setTimeout(() => controller.abort(), timeoutMs) + try { + const res = await fetch(url, { signal: controller.signal, method: 'GET' }) + if (!res.ok) { + throw new Error(`HTTP ${res.status} from ${url}`) + } + } finally { + clearTimeout(timer) + } +} + +function withTimeout(p: Promise, timeoutMs: number, msg: string): Promise { + return new Promise((resolve, reject) => { + const timer = setTimeout(() => reject(new Error(msg)), timeoutMs) + p.then( + (v) => { + clearTimeout(timer) + resolve(v) + }, + (e) => { + 
clearTimeout(timer) + reject(e) + }, + ) + }) +}