feat(operator): live Infrastructure probes + honest split between deployed and planned

The Infrastructure page used to read from a mock fixture that lied two ways:
it listed services that aren't deployed (Jitsi, Zulip, Cloudflare, Object
Storage, Postmark) and showed hardcoded uptime/latency for the ones that
are. Now it shows truth from real probes plus a clearly-labelled "planned"
section for the rest.

Backend (services/platform-api):
- New src/health/ module — HealthService runs 9 probes in parallel with a
  1.5s timeout each:
    Stalwart  → TCP stalwart:8080
    OCIS      → HTTP GET ocis:9200/health
    Collabora → HTTP GET collabora:9980/hosting/discovery
    Authentik → HTTP GET authentik-server:9000/-/health/ready/
    Postgres  → TCP postgres:5432
    Mongo     → existing Mongoose connection.db.admin().ping()
    Redis     → TCP redis:6379
    Traefik   → TCP traefik:80
    Platform API → trivially ok (this code is running)
  Status thresholds: ok ≤500ms, warn >500ms (up to the 1.5s timeout), bad on
  timeout, refused connection, or any other probe error (e.g. non-2xx HTTP).
- HealthController exposes GET /health/platform behind JwtAuthGuard, plus
  keeps the existing public GET /health for infra liveness checks.
- Moved the old src/health.controller.ts into the new module.

Frontend (apps/operator):
- /api/health/platform proxy forwards the operator's access token.
- Infrastructure page swaps SERVICES fixture for useFetch with 30s auto-
  refresh + a manual Refresh button. Cards show real status badge + real
  latency; uptime/error stay as em-dash with a "no probe history yet"
  tooltip until a Prometheus/event-log backend lands.
- Below the live grid, a "Planned · not deployed" section renders 5 dimmed
  cards (Jitsi, Zulip, simpledns.plus, Hetzner Object Storage, Postmark).
  simpledns.plus replaces the misnamed Cloudflare entry — we use
  simpledns.plus, not Cloudflare.
- Subtitle is now truthful: "8 / 9 services live · checked 2s ago".

Verified: stopped redis → card flipped to "down · getaddrinfo ENOTFOUND
redis", subtitle reflected 8/9, incident banner appeared. Restarted →
back to 9/9, banner gone.

SERVICES fixture stays in place for Overview's incident banner — replacing
that is a separate follow-up tied to the incident-management backend.
This commit is contained in:
Ronni Baslund
2026-05-24 18:47:38 +02:00
parent 9fac11e668
commit 77a09aaf77
8 changed files with 316 additions and 43 deletions
+18
View File
@@ -108,6 +108,24 @@ export const OP_AUDIT: AuditEntry[] = [
{ id: 'op_8811', when: '09:30:00', actor: 'Anne Baslund', role: 'platform admin', action: 'tos.published', target: 'v2026.05 · all tenants', tenant: '—', ip: '10.0.4.18', tone: 'info' },
]
// Services in the design that haven't been deployed yet. Surfaced as a
// separate "Planned" section on the Infrastructure page so the operator sees
// honest deployment state instead of a fake all-green grid.
//
// These entries are static descriptions only: nothing probes them, so they
// carry no status, latency, or uptime fields.
export interface PlannedService {
// Stable identifier; used as the list key when the cards are rendered.
id: string
// Display name shown as the card title.
name: string
// Short description of what the service is for, shown under the name.
role: string
// Free-text explanation of when / where the service is expected to land.
note: string
}
// The planned-but-not-deployed services, in display order. Each entry is
// rendered as one "not deployed" card; the section header shows the array
// length, so adding or removing an entry updates the count automatically.
export const PLANNED_SERVICES: PlannedService[] = [
{ id: 'jitsi', name: 'Jitsi', role: 'Video meetings', note: 'Lands with docker-compose.optional.yml (Phase 7)' },
{ id: 'zulip', name: 'Zulip', role: 'Team chat', note: 'Lands with docker-compose.optional.yml (Phase 7)' },
{ id: 'dns', name: 'simpledns.plus', role: 'DNS · authoritative', note: 'External SaaS · prod only' },
{ id: 'objstore', name: 'Hetzner Object Storage', role: 'Files · S3 backend for OCIS', note: 'External · prod only' },
{ id: 'smtp-out', name: 'Postmark', role: 'Outbound SMTP · transactional email', note: 'External SaaS · prod only' },
]
export type NotificationKind = 'security' | 'user' | 'billing' | 'integration' | 'support' | 'signin'
export type NotificationTone = 'warn' | 'info' | 'neutral' | 'ok' | 'bad'
export interface NotificationItem {
+103 -28
View File
@@ -1,15 +1,55 @@
<script setup lang="ts">
import { SERVICES, INCIDENT, type PlatformService } from '~/data/fixtures'
import { INCIDENT, PLANNED_SERVICES } from '~/data/fixtures'
const degradedCount = computed(() => SERVICES.filter((s) => s.status !== 'ok').length)
const incidentActive = computed(() => degradedCount.value > 0)
const { open: openIncident } = useIncidentModal()
function tone(s: PlatformService): 'ok' | 'warn' | 'bad' {
return s.status
// One probe result as delivered by the operator's /api/health/platform
// proxy, which relays platform-api's GET /health/platform response.
// NOTE(review): this mirrors the ProbeResult interface exported by
// services/platform-api/src/health/health.service.ts — keep the two in sync.
interface ProbeResult {
// Stable identifier of the probed service; used as the card key.
id: string
// Display name of the service.
name: string
// Short description of the service's purpose.
role: string
// 'ok' = healthy, 'warn' = reachable but slow, 'bad' = probe failed.
status: 'ok' | 'warn' | 'bad'
// Measured probe duration in milliseconds; null when no meaningful figure
// exists (rendered as an em-dash).
latencyMs: number | null
// Failure reason; only present when the probe failed.
error?: string
// ISO-8601 timestamp of when the probe round was started.
checkedAt: string
}
function label(s: PlatformService) {
return s.status === 'ok' ? 'operational' : s.status === 'warn' ? 'degraded' : 'down'
// Initial load of the probe results. `default` supplies an empty list so
// consumers have an array to work with before the first response arrives.
// `pending` drives the Refresh button's disabled state and `refresh`
// re-issues the request (manually or from the poll timer below).
const { data: probes, pending, refresh } = await useFetch<ProbeResult[]>('/api/health/platform', {
default: () => [],
})
// Auto-refresh: every 30s while this page is mounted.
// `now` is a reactive clock that ticks once per second so the
// "checked Ns ago" text keeps counting between refreshes.
const now = ref(Date.now())
// Interval handles are kept so both timers can be cancelled on unmount.
let pollTimer: ReturnType<typeof setInterval> | null = null
let clockTimer: ReturnType<typeof setInterval> | null = null
// Timers are started in onMounted rather than at setup time, so they only
// exist while the component is actually mounted.
onMounted(() => {
pollTimer = setInterval(() => refresh(), 30_000)
clockTimer = setInterval(() => { now.value = Date.now() }, 1_000)
})
// Stop both timers when leaving the page so they don't keep firing (and
// keep hitting the API) after the component is gone.
onBeforeUnmount(() => {
if (pollTimer) clearInterval(pollTimer)
if (clockTimer) clearInterval(clockTimer)
})
// Header / banner figures derived from the latest probe round. `probes` can
// be empty (or nullish) before the first response, so every reader falls
// back to an empty list.

// Number of services whose probe came back 'ok'.
const liveCount = computed(() => {
  const list = probes.value ?? []
  return list.filter((probe) => probe.status === 'ok').length
})

// Number of services that were probed at all.
const totalCount = computed(() => {
  const list = probes.value ?? []
  return list.length
})

// Number of services reporting anything other than 'ok' ('warn' or 'bad').
const degradedCount = computed(() => {
  const list = probes.value ?? []
  return list.filter((probe) => probe.status !== 'ok').length
})

// The incident banner is shown as soon as one service is not 'ok'.
const incidentActive = computed(() => degradedCount.value > 0)

// Epoch milliseconds of the latest probe round, read from the first result;
// null while there are no results yet.
const lastCheckedAt = computed(() => {
  const first = probes.value?.[0]
  if (!first) return null
  return new Date(first.checkedAt).getTime()
})

// "Ns ago" text for the subtitle and card footers; an em-dash when there is
// no usable timestamp. Clamped at zero so client/server clock skew never
// renders a negative age.
const checkedAgo = computed(() => {
  if (!lastCheckedAt.value) return '—'
  const elapsedSec = Math.floor((now.value - lastCheckedAt.value) / 1000)
  const s = Math.max(0, elapsedSec)
  return `${s}s ago`
})
/**
 * Badge tone for a probe result. The probe status values are already valid
 * tone names, so the status is passed through unchanged.
 */
function tone(p: ProbeResult): 'ok' | 'warn' | 'bad' {
  const { status } = p
  return status
}
/**
 * Human-readable badge text for a probe result: 'ok' reads as operational,
 * 'warn' as degraded, and anything else as down.
 */
function label(p: ProbeResult) {
  switch (p.status) {
    case 'ok':
      return 'operational'
    case 'warn':
      return 'degraded'
    default:
      return 'down'
  }
}
</script>
@@ -18,17 +58,13 @@ function label(s: PlatformService) {
<PageHeader
eyebrow="Operations"
title="Infrastructure"
subtitle="Health of every service that makes up the Dezky platform."
:subtitle="`${liveCount} / ${totalCount} services live · checked ${checkedAgo}`"
>
<template #actions>
<UiButton variant="secondary" disabled>
<UiButton variant="secondary" :disabled="pending" @click="refresh()">
<template #leading><UiIcon name="chevDown" :size="13" /></template>
Refresh
</UiButton>
<UiButton variant="secondary" disabled>
<template #leading><UiIcon name="calendar" :size="13" /></template>
Schedule maintenance
</UiButton>
</template>
</PageHeader>
@@ -40,33 +76,63 @@ function label(s: PlatformService) {
</span>
<div class="body">
<div class="title">{{ INCIDENT.title }}</div>
<div class="sub">Started {{ INCIDENT.started }} · IC: {{ INCIDENT.ic }}</div>
<div class="sub">{{ degradedCount }} service(s) reporting non-ok status · IC: {{ INCIDENT.ic }}</div>
</div>
<UiButton variant="primary" @click="openIncident">Open incident</UiButton>
<UiButton variant="primary" disabled>Open incident</UiButton>
</div>
<Eyebrow class="section-head">Live · {{ totalCount }} services</Eyebrow>
<div class="grid">
<Card v-for="s in SERVICES" :key="s.id" :pad="0">
<Card v-for="p in probes" :key="p.id" :pad="0">
<div class="head">
<div>
<div class="name">{{ p.name }}</div>
<Mono dim>{{ p.role }}</Mono>
</div>
<Badge :tone="tone(p)" dot>{{ label(p) }}</Badge>
</div>
<div class="metrics">
<MetricCell label="uptime · 30d" value="—" :title="'no probe history yet'" />
<MetricCell
label="p95 latency"
:value="p.latencyMs !== null ? `${p.latencyMs}ms` : '—'"
:tone="p.latencyMs !== null && p.latencyMs > 300 ? 'warn' : undefined"
/>
<MetricCell label="error rate" value="—" :title="'no probe history yet'" />
</div>
<div class="foot">
<Mono dim>probed {{ checkedAgo }}</Mono>
<Mono v-if="p.status !== 'ok' && p.error" :class="['err', p.status]" :title="p.error">
{{ p.status === 'bad' ? 'down' : 'slow' }} · {{ p.error.slice(0, 32) }}
</Mono>
<Mono v-else dim>{{ p.status === 'ok' ? 'ok' : 'check details' }}</Mono>
</div>
</Card>
</div>
<Eyebrow class="section-head">Planned · {{ PLANNED_SERVICES.length }} services · not deployed</Eyebrow>
<div class="grid planned">
<Card v-for="s in PLANNED_SERVICES" :key="s.id" :pad="0">
<div class="head">
<div>
<div class="name">{{ s.name }}</div>
<Mono dim>{{ s.role }}</Mono>
</div>
<Badge :tone="tone(s)" dot>{{ label(s) }}</Badge>
<Badge tone="neutral" dot>not deployed</Badge>
</div>
<div class="metrics">
<MetricCell label="uptime · 30d" :value="`${s.uptime.toFixed(2)}%`" />
<MetricCell label="p95 latency" :value="`${s.p95}ms`" :tone="s.p95 > 300 ? 'warn' : undefined" />
<MetricCell label="error rate" :value="`${s.err.toFixed(3)}%`" :tone="s.err > 0.04 ? 'warn' : undefined" />
</div>
<div class="foot">
<Mono dim>last incident · {{ s.last }}</Mono>
<Mono dim>details </Mono>
<div class="planned-body">
<Mono dim>{{ s.note }}</Mono>
</div>
</Card>
</div>
<Mono dim class="note">// mock fixtures — wire up to Docker healthchecks + Prometheus in a follow-up</Mono>
<Mono dim class="note">
// probes live in services/platform-api/src/health/. uptime / error rate stay
em-dashed until a probe history (Prometheus, persisted event log) lands
see "Real observability" in NEXT-STEPS.md follow-ups
</Mono>
</div>
</div>
</template>
@@ -74,6 +140,8 @@ function label(s: PlatformService) {
<style scoped>
.stage { padding: 24px 40px 64px 40px; display: flex; flex-direction: column; gap: 16px; }
.section-head { display: block; padding: 6px 4px; }
.incident {
display: flex;
align-items: center;
@@ -103,6 +171,8 @@ function label(s: PlatformService) {
.sub { font-size: 12px; color: var(--text-mute); margin-top: 2px; }
.grid { display: grid; grid-template-columns: repeat(3, 1fr); gap: 12px; }
.grid.planned { opacity: 0.6; }
.head {
padding: 16px 18px 12px 18px;
border-bottom: 1px solid var(--border);
@@ -125,7 +195,12 @@ function label(s: PlatformService) {
border-top: 1px solid var(--border);
display: flex;
justify-content: space-between;
gap: 8px;
}
.err.bad { color: var(--bad); }
.err.warn { color: var(--warn); }
.planned-body { padding: 14px 18px; }
.note { display: block; padding: 4px 4px 0 4px; }
</style>
@@ -0,0 +1,3 @@
import { platformApi } from '~~/server/utils/platform-api'
// Server route for /api/health/platform: hands the incoming event to the
// shared platformApi helper, targeting platform-api's /health/platform.
// NOTE(review): the helper is expected to attach the operator's access
// token to the upstream call — confirm in server/utils/platform-api.
export default defineEventHandler((event) => {
  return platformApi(event, '/health/platform')
})
+2 -2
View File
@@ -2,7 +2,7 @@ import { Module } from '@nestjs/common'
import { ConfigModule } from '@nestjs/config'
import { MongooseModule } from '@nestjs/mongoose'
import { AuthModule } from './auth/auth.module.js'
import { HealthController } from './health.controller.js'
import { HealthModule } from './health/health.module.js'
import { PartnersModule } from './partners/partners.module.js'
import { SeedModule } from './seed/seed.module.js'
import { SubscriptionsModule } from './subscriptions/subscriptions.module.js'
@@ -16,12 +16,12 @@ import { UsersModule } from './users/users.module.js'
process.env.MONGODB_URI ?? 'mongodb://localhost:27017/dezky',
),
AuthModule,
HealthModule,
TenantsModule,
PartnersModule,
UsersModule,
SubscriptionsModule,
SeedModule,
],
controllers: [HealthController],
})
export class AppModule {}
@@ -1,13 +0,0 @@
import { Controller, Get } from '@nestjs/common'
// Minimal liveness controller mounted at /health.
@Controller('health')
export class HealthController {
// GET /health — returns a static "ok" payload with the service name and the
// current time as an ISO-8601 string. It performs no dependency checks: a
// response only shows that this process is up and serving requests.
@Get()
check() {
return {
status: 'ok',
service: 'dezky-platform-api',
timestamp: new Date().toISOString(),
}
}
}
@@ -0,0 +1,28 @@
import { Controller, Get, UseGuards } from '@nestjs/common'
import { JwtAuthGuard } from '../auth/jwt-auth.guard.js'
import { HealthService } from './health.service.js'
/**
 * HTTP entry points under `/health`: a public liveness check and an
 * authenticated, aggregated view of the neighbouring services.
 */
@Controller('health')
export class HealthController {
  constructor(private readonly health: HealthService) {}

  /**
   * GET /health — public liveness probe for infra (Docker / k8s). It only
   * says that the platform-api process is alive, so it is deliberately left
   * without an auth guard.
   */
  @Get()
  check() {
    const timestamp = new Date().toISOString()
    return { status: 'ok', service: 'dezky-platform-api', timestamp }
  }

  /**
   * GET /health/platform — runs every service probe and returns the
   * results. Guarded by JwtAuthGuard because the payload exposes the
   * deployment topology and should not be readable anonymously.
   */
  @Get('platform')
  @UseGuards(JwtAuthGuard)
  async platform() {
    const results = await this.health.probeAll()
    return results
  }
}
@@ -0,0 +1,11 @@
import { Module } from '@nestjs/common'
import { AuthModule } from '../auth/auth.module.js'
import { HealthController } from './health.controller.js'
import { HealthService } from './health.service.js'
// Nest module bundling the /health endpoints (HealthController) with the
// probe runner (HealthService).
// NOTE(review): AuthModule is imported presumably so that JwtAuthGuard, used
// on GET /health/platform, can resolve its dependencies — confirm against
// AuthModule's exports.
@Module({
imports: [AuthModule],
controllers: [HealthController],
providers: [HealthService],
})
export class HealthModule {}
@@ -0,0 +1,151 @@
// Live health probes for the services we expect to find in the Dezky stack.
// Hostnames + ports are the compose service names from
// infrastructure/docker-compose/docker-compose.yml. When we move to k3s,
// swap these for in-cluster service DNS (e.g. authentik.dezky.svc...).
import { Injectable } from '@nestjs/common'
import { InjectConnection } from '@nestjs/mongoose'
import type { Connection } from 'mongoose'
import * as net from 'node:net'
// Outcome of a single probe: 'ok' = succeeded within the warn threshold,
// 'warn' = succeeded but slower than the threshold, 'bad' = the probe threw
// (timeout, refused connection, non-2xx HTTP, ...).
export type ProbeStatus = 'ok' | 'warn' | 'bad'
// One entry of the GET /health/platform response: the identity of the probed
// service plus the outcome of its latest probe.
export interface ProbeResult {
// Stable identifier of the probed service (e.g. 'mail', 'redis').
id: string
// Display name of the service.
name: string
// Short description of what the service does in the stack.
role: string
// Classification of the probe outcome; see ProbeStatus.
status: ProbeStatus
// Duration of the probe in milliseconds. Null when a failed probe took at
// least the full timeout budget, where the figure would only restate the
// timeout.
latencyMs: number | null
// Failure message; only set when the probe threw.
error?: string
// ISO-8601 timestamp taken once per probe round, shared by all results.
checkedAt: string
}
// Static description of one probe: the identity fields copied into its
// ProbeResult, plus the check itself.
interface ProbeSpec {
id: string
name: string
role: string
// Performs the check. Resolving (with no value) means success; failure is
// signalled by throwing / rejecting, and the error message becomes the
// reported reason. Latency is measured around the call by the runner.
// `timeoutMs` is the time budget the probe is expected to respect.
run(timeoutMs: number): Promise<void>
}
// Generous-ish per-probe budget. Probes run in parallel, so the total
// /health/platform response should be ~timeout regardless of count.
const PROBE_TIMEOUT_MS = 1500
// A probe that succeeds but takes longer than this is reported as 'warn'
// instead of 'ok'.
const WARN_THRESHOLD_MS = 500
/**
 * Runs the live health probes for the services the platform depends on and
 * reports one ProbeResult per service.
 */
@Injectable()
export class HealthService {
  constructor(@InjectConnection() private readonly mongo: Connection) {}

  /**
   * Probes every known service concurrently and returns the results in the
   * order listed below.
   *
   * Individual probe failures are reported as `status: 'bad'` entries rather
   * than thrown, so one unreachable service never fails the whole call. All
   * results share a single `checkedAt` timestamp taken before the probes
   * start.
   *
   * @returns One ProbeResult per service.
   */
  async probeAll(): Promise<ProbeResult[]> {
    // Each probe uses the budget handed to it by the runner instead of
    // reaching for the module constant, so the timeout is decided in exactly
    // one place.
    const probes: ProbeSpec[] = [
      { id: 'mail', name: 'Stalwart', role: 'Mail · IMAP/JMAP/SMTP', run: (timeoutMs) => tcpProbe('stalwart', 8080, timeoutMs) },
      { id: 'files', name: 'OCIS', role: 'Files · OwnCloud Infinite', run: (timeoutMs) => httpProbe('http://ocis:9200/health', timeoutMs) },
      { id: 'office', name: 'Collabora', role: 'Office editing · WOPI', run: (timeoutMs) => httpProbe('http://collabora:9980/hosting/discovery', timeoutMs) },
      { id: 'auth', name: 'Authentik', role: 'Identity · SSO · MFA', run: (timeoutMs) => httpProbe('http://authentik-server:9000/-/health/ready/', timeoutMs) },
      { id: 'pg', name: 'PostgreSQL', role: 'Authentik + OCIS database', run: (timeoutMs) => tcpProbe('postgres', 5432, timeoutMs) },
      { id: 'mongo', name: 'MongoDB', role: 'Platform application data', run: (timeoutMs) => this.mongoPing(timeoutMs) },
      { id: 'redis', name: 'Redis', role: 'Cache + session store', run: (timeoutMs) => tcpProbe('redis', 6379, timeoutMs) },
      { id: 'proxy', name: 'Traefik', role: 'Reverse proxy · TLS', run: (timeoutMs) => tcpProbe('traefik', 80, timeoutMs) },
      // platform-api itself: this code is running, so it's trivially ok.
      { id: 'api', name: 'Platform API', role: 'Control plane', run: async () => { /* always ok */ } },
    ]
    const checkedAt = new Date().toISOString()
    return Promise.all(probes.map((p) => run(p, checkedAt)))
  }

  /**
   * Pings MongoDB over the application's existing Mongoose connection.
   *
   * @param timeoutMs Maximum time to wait for the ping to answer.
   * @throws Error when the connection is not open, has no database handle,
   *   the ping fails, or the ping does not answer within `timeoutMs`.
   */
  private async mongoPing(timeoutMs: number): Promise<void> {
    // readyState 1 means the connection is open; anything else cannot
    // serve a ping.
    if (this.mongo.readyState !== 1) {
      throw new Error(`mongoose readyState=${this.mongo.readyState}`)
    }
    // db should be defined once the connection is open, but fail with a
    // readable reason instead of a TypeError if it is not.
    const db = this.mongo.db
    if (!db) {
      throw new Error('mongoose connection has no db handle')
    }
    await withTimeout(db.admin().ping(), timeoutMs, 'mongo ping timed out')
  }
}
// ── Runner ─────────────────────────────────────────────────────────────────
/**
 * Executes one probe and turns its outcome into a ProbeResult.
 *
 * Never rejects: anything the probe throws is captured and reported as a
 * `status: 'bad'` result carrying the error message.
 *
 * @param spec The probe to run, including the identity fields copied into
 *   the result.
 * @param checkedAt ISO-8601 timestamp of the probe round, shared by every
 *   result of that round.
 * @returns 'ok' when the probe succeeded within WARN_THRESHOLD_MS, 'warn'
 *   when it succeeded but was slower, 'bad' when it threw.
 */
async function run(spec: ProbeSpec, checkedAt: string): Promise<ProbeResult> {
  const identity = { id: spec.id, name: spec.name, role: spec.role }
  // Durations are taken from the monotonic clock so that a wall-clock
  // adjustment (NTP step, manual change) during a probe cannot produce a
  // negative or inflated latency.
  const startedAt = performance.now()
  const elapsedMs = (): number => Math.max(0, Math.round(performance.now() - startedAt))
  try {
    await spec.run(PROBE_TIMEOUT_MS)
    const latencyMs = elapsedMs()
    return {
      ...identity,
      status: latencyMs > WARN_THRESHOLD_MS ? 'warn' : 'ok',
      latencyMs,
      checkedAt,
    }
  } catch (err) {
    const latencyMs = elapsedMs()
    return {
      ...identity,
      status: 'bad',
      // A failure that consumed the whole budget is a timeout; its duration
      // would only restate the timeout, so report no latency at all.
      latencyMs: latencyMs < PROBE_TIMEOUT_MS ? latencyMs : null,
      error: err instanceof Error ? err.message : String(err),
      checkedAt,
    }
  }
}
// ── Primitives ─────────────────────────────────────────────────────────────
/**
 * Checks that a TCP connection to `host:port` can be established.
 *
 * @param host Hostname or address to connect to.
 * @param port TCP port to connect to.
 * @param timeoutMs Maximum time to wait for the connection to open.
 * @returns A promise that resolves once the connection is established and
 *   rejects with the socket error, or with a timeout error if the
 *   connection has not opened within `timeoutMs`.
 */
function tcpProbe(host: string, port: number, timeoutMs: number): Promise<void> {
  return new Promise<void>((resolve, reject) => {
    const conn = net.createConnection({ host, port })

    // Deadline for the whole connect attempt: tear the socket down and
    // report a timeout if nothing happened in time.
    const deadline = setTimeout(() => {
      conn.destroy()
      reject(new Error(`tcp ${host}:${port} timed out after ${timeoutMs}ms`))
    }, timeoutMs)

    const onConnect = (): void => {
      clearTimeout(deadline)
      // Reachability is all we wanted to know; close our side right away.
      conn.end()
      resolve()
    }
    const onError = (cause: Error): void => {
      clearTimeout(deadline)
      reject(cause)
    }

    conn.once('connect', onConnect)
    conn.once('error', onError)
  })
}
/**
 * Checks that an HTTP GET to `url` answers with a 2xx status in time.
 *
 * @param url Absolute URL to request.
 * @param timeoutMs Maximum time to wait for the response headers.
 * @throws Error `HTTP <status> from <url>` on a non-2xx response,
 *   `GET <url> timed out after <timeoutMs>ms` when the budget is exceeded,
 *   or the underlying fetch error (DNS failure, refused connection, ...).
 */
async function httpProbe(url: string, timeoutMs: number): Promise<void> {
  const controller = new AbortController()
  let timedOut = false
  const timer = setTimeout(() => {
    timedOut = true
    controller.abort()
  }, timeoutMs)
  try {
    const res = await fetch(url, { signal: controller.signal, method: 'GET' })
    // Only the status matters. Discard the body explicitly so the connection
    // is released instead of waiting on an unread (possibly large) payload.
    // Best effort: a failure to cancel must not turn a healthy probe bad.
    await res.body?.cancel().catch(() => undefined)
    if (!res.ok) {
      throw new Error(`HTTP ${res.status} from ${url}`)
    }
  } catch (err) {
    // An abort triggered by our own timer would otherwise surface as a
    // generic "operation was aborted" error; report it as a timeout.
    if (timedOut) {
      throw new Error(`GET ${url} timed out after ${timeoutMs}ms`)
    }
    throw err
  } finally {
    clearTimeout(timer)
  }
}
/**
 * Settles like `p`, unless `p` is still pending after `timeoutMs`, in which
 * case the returned promise rejects with `new Error(msg)`.
 *
 * The timer is cleared as soon as either side settles. `p` itself is not
 * cancelled on timeout; its eventual outcome is simply ignored.
 *
 * @param p The promise to put a deadline on.
 * @param timeoutMs Deadline in milliseconds.
 * @param msg Message of the error used when the deadline passes first.
 */
function withTimeout<T>(p: Promise<T>, timeoutMs: number, msg: string): Promise<T> {
  let timer: ReturnType<typeof setTimeout> | undefined
  const deadline = new Promise<never>((_resolve, reject) => {
    timer = setTimeout(() => reject(new Error(msg)), timeoutMs)
  })
  return Promise.race([p, deadline]).finally(() => clearTimeout(timer))
}