feat(operator): live Infrastructure probes + honest split between deployed and planned

The Infrastructure page used to read from a mock fixture that lied two ways:
it listed services that aren't deployed (Jitsi, Zulip, Cloudflare, Object
Storage, Postmark) and showed hardcoded uptime/latency for the ones that
are. Now it shows truth from real probes plus a clearly-labelled "planned"
section for the rest.

Backend (services/platform-api):
- New src/health/ module — HealthService runs 9 probes in parallel with a
  1.5s timeout each:
    Stalwart  → TCP stalwart:8080
    OCIS      → HTTP GET ocis:9200/health
    Collabora → HTTP GET collabora:9980/hosting/discovery
    Authentik → HTTP GET authentik-server:9000/-/health/ready/
    Postgres  → TCP postgres:5432
    Mongo     → existing Mongoose connection.db.admin().ping()
    Redis     → TCP redis:6379
    Traefik   → TCP traefik:80
    Platform API → trivially ok (this code is running)
  Status thresholds: ok ≤500ms, warn >500ms (up to the 1.5s timeout), bad on
  timeout, connection refusal, or any other probe error.
- HealthController exposes GET /health/platform behind JwtAuthGuard, plus
  keeps the existing public GET /health for infra liveness checks.
- Moved the old src/health.controller.ts into the new module.

Frontend (apps/operator):
- /api/health/platform proxy forwards the operator's access token.
- Infrastructure page swaps SERVICES fixture for useFetch with 30s auto-
  refresh + a manual Refresh button. Cards show real status badge + real
  latency; uptime/error stay as em-dash with a "no probe history yet"
  tooltip until a Prometheus/event-log backend lands.
- Below the live grid, a "Planned · not deployed" section renders 5 dimmed
  cards (Jitsi, Zulip, simpledns.plus, Hetzner Object Storage, Postmark).
  simpledns.plus replaces the misnamed Cloudflare entry — we use
  simpledns.plus, not Cloudflare.
- Subtitle is now truthful: "8 / 9 services live · checked 2s ago".

Verified: stopped redis → card flipped to "down · getaddrinfo ENOTFOUND
redis", subtitle reflected 8/9, incident banner appeared. Restarted →
back to 9/9, banner gone.

SERVICES fixture stays in place for Overview's incident banner — replacing
that is a separate follow-up tied to the incident-management backend.
This commit is contained in:
Ronni Baslund
2026-05-24 18:47:38 +02:00
parent 9fac11e668
commit 77a09aaf77
8 changed files with 316 additions and 43 deletions
+2 -2
View File
@@ -2,7 +2,7 @@ import { Module } from '@nestjs/common'
import { ConfigModule } from '@nestjs/config'
import { MongooseModule } from '@nestjs/mongoose'
import { AuthModule } from './auth/auth.module.js'
import { HealthController } from './health.controller.js'
import { HealthModule } from './health/health.module.js'
import { PartnersModule } from './partners/partners.module.js'
import { SeedModule } from './seed/seed.module.js'
import { SubscriptionsModule } from './subscriptions/subscriptions.module.js'
@@ -16,12 +16,12 @@ import { UsersModule } from './users/users.module.js'
process.env.MONGODB_URI ?? 'mongodb://localhost:27017/dezky',
),
AuthModule,
HealthModule,
TenantsModule,
PartnersModule,
UsersModule,
SubscriptionsModule,
SeedModule,
],
controllers: [HealthController],
})
export class AppModule {}
@@ -1,13 +0,0 @@
import { Controller, Get } from '@nestjs/common'
/**
 * Liveness endpoint for the platform API, mounted under `/health`.
 */
@Controller('health')
export class HealthController {
  /**
   * Reports that the process is up.
   *
   * @returns a static `ok` status, the service name, and the current time as
   *   an ISO-8601 string.
   */
  @Get()
  check() {
    const timestamp = new Date().toISOString()
    return { status: 'ok', service: 'dezky-platform-api', timestamp }
  }
}
@@ -0,0 +1,28 @@
import { Controller, Get, UseGuards } from '@nestjs/common'
import { JwtAuthGuard } from '../auth/jwt-auth.guard.js'
import { HealthService } from './health.service.js'
/**
 * HTTP endpoints under `/health`: a public liveness check and an
 * authenticated aggregate of the per-service probes.
 */
@Controller('health')
export class HealthController {
  constructor(private readonly health: HealthService) {}

  /**
   * Public liveness probe — used by infra (Docker / k8s) to know the
   * platform-api process is alive. Intentionally not behind auth.
   *
   * @returns a static `ok` status, the service name, and the current time as
   *   an ISO-8601 string.
   */
  @Get()
  check() {
    const timestamp = new Date().toISOString()
    return { status: 'ok', service: 'dezky-platform-api', timestamp }
  }

  /**
   * Aggregated probes of every neighbouring service in the stack. Guarded by
   * JwtAuthGuard because the response leaks the topology of the deployment
   * and must not be readable by unauthenticated clients.
   *
   * @returns whatever `HealthService.probeAll()` resolves to.
   */
  @Get('platform')
  @UseGuards(JwtAuthGuard)
  platform() {
    // probeAll() already returns a promise; hand it straight to the framework.
    return this.health.probeAll()
  }
}
@@ -0,0 +1,11 @@
import { Module } from '@nestjs/common'
import { AuthModule } from '../auth/auth.module.js'
import { HealthController } from './health.controller.js'
import { HealthService } from './health.service.js'
// Wires the health feature together: registers HealthController and provides
// HealthService to it.
// NOTE(review): AuthModule is presumably imported so the JwtAuthGuard used on
// the controller can resolve its dependencies — confirm against AuthModule's
// exports.
@Module({
imports: [AuthModule],
controllers: [HealthController],
providers: [HealthService],
})
export class HealthModule {}
@@ -0,0 +1,151 @@
// Live health probes for the services we expect to find in the Dezky stack.
// Hostnames + ports are the compose service names from
// infrastructure/docker-compose/docker-compose.yml. When we move to k3s,
// swap these for in-cluster service DNS (e.g. authentik.dezky.svc...).
import { Injectable } from '@nestjs/common'
import { InjectConnection } from '@nestjs/mongoose'
import type { Connection } from 'mongoose'
import * as net from 'node:net'
// Outcome bucket for a single probe: 'ok' and 'warn' are both successful
// probes ('warn' means it was slower than the warn threshold); 'bad' means
// the probe threw.
export type ProbeStatus = 'ok' | 'warn' | 'bad'
// Result of probing a single service, as returned by HealthService.probeAll().
export interface ProbeResult {
// Identifier of the probed service, copied from its probe spec (e.g. 'redis').
id: string
// Display name of the service (e.g. 'Redis').
name: string
// Short description of what the service does in the stack.
role: string
// 'ok' or 'warn' when the probe succeeded, 'bad' when it threw.
status: ProbeStatus
// Milliseconds elapsed around the probe call. null when a failed probe used
// up the whole timeout budget, so no meaningful latency exists.
latencyMs: number | null
// Failure message; only set when status is 'bad'.
error?: string
// ISO-8601 timestamp taken once per probeAll() run and shared by every
// result of that run.
checkedAt: string
}
// Static description of one service plus the function that probes it.
interface ProbeSpec {
// Identifier, display name and role are copied verbatim into the ProbeResult.
id: string
name: string
role: string
// Resolves (with no value) when the service is healthy and rejects with an
// Error describing the failure otherwise. The probe does not time itself:
// latency is measured around this call by the runner, which also passes in
// the per-probe time budget in milliseconds.
run(timeoutMs: number): Promise<void>
}
// Generous-ish per-probe budget. Probes run in parallel, so the total
// /health/platform response should be ~timeout regardless of count.
const PROBE_TIMEOUT_MS = 1500
// A probe that succeeds but takes strictly longer than this many
// milliseconds is reported as 'warn' instead of 'ok'.
const WARN_THRESHOLD_MS = 500
/**
 * Runs the live health probes for every service in the stack and reports one
 * ProbeResult per service.
 */
@Injectable()
export class HealthService {
  constructor(@InjectConnection() private readonly mongo: Connection) {}

  /**
   * Probes all known services in parallel.
   *
   * Each spec honours the `timeoutMs` budget handed to it by the runner
   * instead of reaching for the module constant itself, so the budget is
   * decided in exactly one place.
   *
   * @returns one result per service, in the order listed below. Individual
   *   probe failures are reported as `status: 'bad'` entries by the runner
   *   rather than rejecting the whole call.
   */
  async probeAll(): Promise<ProbeResult[]> {
    const probes: ProbeSpec[] = [
      { id: 'mail', name: 'Stalwart', role: 'Mail · IMAP/JMAP/SMTP', run: (timeoutMs) => tcpProbe('stalwart', 8080, timeoutMs) },
      { id: 'files', name: 'OCIS', role: 'Files · OwnCloud Infinite', run: (timeoutMs) => httpProbe('http://ocis:9200/health', timeoutMs) },
      { id: 'office', name: 'Collabora', role: 'Office editing · WOPI', run: (timeoutMs) => httpProbe('http://collabora:9980/hosting/discovery', timeoutMs) },
      { id: 'auth', name: 'Authentik', role: 'Identity · SSO · MFA', run: (timeoutMs) => httpProbe('http://authentik-server:9000/-/health/ready/', timeoutMs) },
      { id: 'pg', name: 'PostgreSQL', role: 'Authentik + OCIS database', run: (timeoutMs) => tcpProbe('postgres', 5432, timeoutMs) },
      { id: 'mongo', name: 'MongoDB', role: 'Platform application data', run: (timeoutMs) => this.mongoPing(timeoutMs) },
      { id: 'redis', name: 'Redis', role: 'Cache + session store', run: (timeoutMs) => tcpProbe('redis', 6379, timeoutMs) },
      { id: 'proxy', name: 'Traefik', role: 'Reverse proxy · TLS', run: (timeoutMs) => tcpProbe('traefik', 80, timeoutMs) },
      // platform-api itself: this code is running, so it's trivially ok.
      { id: 'api', name: 'Platform API', role: 'Control plane', run: async () => { /* always ok */ } },
    ]
    // One timestamp for the whole batch so every card shows the same "checked at".
    const checkedAt = new Date().toISOString()
    return Promise.all(probes.map((p) => run(p, checkedAt)))
  }

  /**
   * Pings MongoDB over the existing Mongoose connection.
   *
   * @param timeoutMs maximum time to wait for the ping to answer.
   * @throws Error when the connection is not open (readyState other than 1),
   *   when it exposes no db handle, when the ping fails, or when the ping
   *   does not answer within `timeoutMs`.
   */
  private async mongoPing(timeoutMs: number): Promise<void> {
    if (this.mongo.readyState !== 1) {
      throw new Error(`mongoose readyState=${this.mongo.readyState}`)
    }
    // db should be defined once the connection is open, but guard explicitly
    // so a surprise here becomes a readable probe error, not a TypeError.
    const db = this.mongo.db
    if (!db) {
      throw new Error('mongoose connection has no db handle')
    }
    await withTimeout(db.admin().ping(), timeoutMs, 'mongo ping timed out')
  }
}
// ── Runner ─────────────────────────────────────────────────────────────────
/**
 * Executes one probe and turns its outcome into a ProbeResult.
 *
 * Latency is the wall-clock time around `spec.run`. A probe that resolves is
 * 'warn' when it took longer than WARN_THRESHOLD_MS and 'ok' otherwise; a
 * probe that rejects is 'bad' and carries the failure message. This function
 * never rejects.
 *
 * @param spec the service description and its probe function.
 * @param checkedAt timestamp to stamp on the result.
 */
async function run(spec: ProbeSpec, checkedAt: string): Promise<ProbeResult> {
  const { id, name, role } = spec
  const startedAt = Date.now()
  const elapsed = (): number => Date.now() - startedAt

  try {
    await spec.run(PROBE_TIMEOUT_MS)
    const latencyMs = elapsed()
    const status = latencyMs > WARN_THRESHOLD_MS ? 'warn' : 'ok'
    return { id, name, role, status, latencyMs, checkedAt }
  } catch (err) {
    const took = elapsed()
    const error = err instanceof Error ? err.message : String(err)
    return {
      id,
      name,
      role,
      status: 'bad',
      // A failure that consumed the whole budget has no meaningful latency.
      latencyMs: took < PROBE_TIMEOUT_MS ? took : null,
      error,
      checkedAt,
    }
  }
}
// ── Primitives ─────────────────────────────────────────────────────────────
/**
 * Checks that a TCP connection to `host:port` can be established.
 *
 * The socket is destroyed on every exit path (connect, error, timeout). The
 * probe only needs the handshake, so there is no reason to half-close with
 * `end()` and keep the descriptor around until the peer closes its side.
 *
 * @param host hostname or IP address to connect to.
 * @param port TCP port to connect to.
 * @param timeoutMs maximum time to wait for the connection.
 * @returns a promise that resolves once the connection is established.
 * @throws the socket error (e.g. ECONNREFUSED, ENOTFOUND) when connecting
 *   fails, or an Error mentioning the timeout when `timeoutMs` elapses first.
 */
function tcpProbe(host: string, port: number, timeoutMs: number): Promise<void> {
  return new Promise((resolve, reject) => {
    const socket = net.createConnection({ host, port })
    const timer = setTimeout(() => {
      socket.destroy()
      reject(new Error(`tcp ${host}:${port} timed out after ${timeoutMs}ms`))
    }, timeoutMs)
    // Persistent listener: any error on this socket, early or late, must end
    // up here and never surface as an unhandled 'error' event.
    socket.on('error', (err) => {
      clearTimeout(timer)
      socket.destroy()
      reject(err)
    })
    socket.once('connect', () => {
      clearTimeout(timer)
      socket.destroy()
      resolve()
    })
  })
}
/**
 * Checks that `url` answers a GET with a 2xx status.
 *
 * Only the status line matters, so the response body is cancelled right away
 * instead of being left unread; an unread body can keep the underlying
 * connection pinned. A timeout is reported with an explicit message, matching
 * the wording used by the TCP probe, rather than a generic abort error.
 *
 * @param url absolute URL to request.
 * @param timeoutMs maximum time to wait for the response headers.
 * @throws Error `HTTP <status> from <url>` on a non-2xx response, an Error
 *   mentioning the timeout when `timeoutMs` elapses first, or the underlying
 *   fetch error (DNS failure, connection refused, ...).
 */
async function httpProbe(url: string, timeoutMs: number): Promise<void> {
  const controller = new AbortController()
  let timedOut = false
  const timer = setTimeout(() => {
    timedOut = true
    controller.abort()
  }, timeoutMs)
  try {
    const res = await fetch(url, { signal: controller.signal, method: 'GET' })
    // Headers are in: the deadline no longer applies.
    clearTimeout(timer)
    // Discard the body without reading it; cancellation errors are irrelevant.
    void res.body?.cancel().catch(() => undefined)
    if (!res.ok) {
      throw new Error(`HTTP ${res.status} from ${url}`)
    }
  } catch (err) {
    if (timedOut) {
      throw new Error(`http ${url} timed out after ${timeoutMs}ms`)
    }
    throw err
  } finally {
    clearTimeout(timer)
  }
}
/**
 * Settles like `p`, unless `p` is still pending after `timeoutMs`, in which
 * case the returned promise rejects with `new Error(msg)`.
 *
 * The underlying operation is not cancelled on timeout; its eventual outcome
 * is simply ignored. The timer is cleared as soon as `p` settles.
 *
 * @param p the promise to put a deadline on.
 * @param timeoutMs deadline in milliseconds.
 * @param msg message of the Error used for the timeout rejection.
 */
function withTimeout<T>(p: Promise<T>, timeoutMs: number, msg: string): Promise<T> {
  return new Promise<T>((resolve, reject) => {
    const deadline = setTimeout(() => {
      reject(new Error(msg))
    }, timeoutMs)

    // Wrap a settle callback so the deadline is disarmed before it runs.
    const disarmThen =
      <A>(settle: (arg: A) => void) =>
      (arg: A): void => {
        clearTimeout(deadline)
        settle(arg)
      }

    p.then(disarmThen(resolve), disarmThen(reject))
  })
}