diff --git a/apps/operator/data/fixtures.ts b/apps/operator/data/fixtures.ts
index 96c77ec..18baace 100644
--- a/apps/operator/data/fixtures.ts
+++ b/apps/operator/data/fixtures.ts
@@ -108,6 +108,24 @@ export const OP_AUDIT: AuditEntry[] = [
{ id: 'op_8811', when: '09:30:00', actor: 'Anne Baslund', role: 'platform admin', action: 'tos.published', target: 'v2026.05 · all tenants', tenant: '—', ip: '10.0.4.18', tone: 'info' },
]
+// Services in the design that haven't been deployed yet. Surfaced as a
+// separate "Planned" section on the Infrastructure page so the operator sees
+// honest deployment state instead of a fake all-green grid.
+export interface PlannedService {
+ // Short identifier for the entry, e.g. 'jitsi'.
+ id: string
+ // Display name of the service.
+ name: string
+ // What the service is for, e.g. 'Video meetings'.
+ role: string
+ // Free-text remark about when or where the service becomes available.
+ note: string
+}
+
+// Entries shown in the "Planned" section of the Infrastructure page.
+export const PLANNED_SERVICES: PlannedService[] = [
+  {
+    id: 'jitsi',
+    name: 'Jitsi',
+    role: 'Video meetings',
+    note: 'Lands with docker-compose.optional.yml (Phase 7)',
+  },
+  {
+    id: 'zulip',
+    name: 'Zulip',
+    role: 'Team chat',
+    note: 'Lands with docker-compose.optional.yml (Phase 7)',
+  },
+  {
+    id: 'dns',
+    name: 'simpledns.plus',
+    role: 'DNS · authoritative',
+    note: 'External SaaS · prod only',
+  },
+  {
+    id: 'objstore',
+    name: 'Hetzner Object Storage',
+    role: 'Files · S3 backend for OCIS',
+    note: 'External · prod only',
+  },
+  {
+    id: 'smtp-out',
+    name: 'Postmark',
+    role: 'Outbound SMTP · transactional email',
+    note: 'External SaaS · prod only',
+  },
+]
+
export type NotificationKind = 'security' | 'user' | 'billing' | 'integration' | 'support' | 'signin'
export type NotificationTone = 'warn' | 'info' | 'neutral' | 'ok' | 'bad'
export interface NotificationItem {
diff --git a/apps/operator/pages/infrastructure.vue b/apps/operator/pages/infrastructure.vue
index 62af2df..ee49e18 100644
--- a/apps/operator/pages/infrastructure.vue
+++ b/apps/operator/pages/infrastructure.vue
@@ -1,15 +1,55 @@
@@ -18,17 +58,13 @@ function label(s: PlatformService) {
-
+
Refresh
-
-
- Schedule maintenance
-
@@ -40,33 +76,63 @@ function label(s: PlatformService) {
{{ INCIDENT.title }}
-
Started {{ INCIDENT.started }} · IC: {{ INCIDENT.ic }}
+
{{ degradedCount }} service(s) reporting non-ok status · IC: {{ INCIDENT.ic }}
- Open incident
+ Open incident
+ Live · {{ totalCount }} services
+
-
+
+
+
+
{{ p.name }}
+
{{ p.role }}
+
+
{{ label(p) }}
+
+
+
+
+
+
+
+
+
+
+ Planned · {{ PLANNED_SERVICES.length }} services · not deployed
+
+
+
{{ s.name }}
{{ s.role }}
-
{{ label(s) }}
+
not deployed
-
-
-
-
-
-
- // mock fixtures — wire up to Docker healthchecks + Prometheus in a follow-up
+
+ // probes live in services/platform-api/src/health/. uptime / error rate stay
+ em-dashed until a probe history (Prometheus, persisted event log) lands —
+ see "Real observability" in NEXT-STEPS.md follow-ups
+
@@ -74,6 +140,8 @@ function label(s: PlatformService) {
diff --git a/apps/operator/server/api/health/platform.get.ts b/apps/operator/server/api/health/platform.get.ts
new file mode 100644
index 0000000..9691e1b
--- /dev/null
+++ b/apps/operator/server/api/health/platform.get.ts
@@ -0,0 +1,3 @@
+import { platformApi } from '~~/server/utils/platform-api'
+
+// Handler for this route: passes the incoming event and the '/health/platform'
+// path to platformApi and returns whatever that call returns.
+export default defineEventHandler((event) => {
+  return platformApi(event, '/health/platform')
+})
diff --git a/services/platform-api/src/app.module.ts b/services/platform-api/src/app.module.ts
index 63170b0..e159b8e 100644
--- a/services/platform-api/src/app.module.ts
+++ b/services/platform-api/src/app.module.ts
@@ -2,7 +2,7 @@ import { Module } from '@nestjs/common'
import { ConfigModule } from '@nestjs/config'
import { MongooseModule } from '@nestjs/mongoose'
import { AuthModule } from './auth/auth.module.js'
-import { HealthController } from './health.controller.js'
+import { HealthModule } from './health/health.module.js'
import { PartnersModule } from './partners/partners.module.js'
import { SeedModule } from './seed/seed.module.js'
import { SubscriptionsModule } from './subscriptions/subscriptions.module.js'
@@ -16,12 +16,12 @@ import { UsersModule } from './users/users.module.js'
process.env.MONGODB_URI ?? 'mongodb://localhost:27017/dezky',
),
AuthModule,
+ HealthModule,
TenantsModule,
PartnersModule,
UsersModule,
SubscriptionsModule,
SeedModule,
],
- controllers: [HealthController],
})
export class AppModule {}
diff --git a/services/platform-api/src/health.controller.ts b/services/platform-api/src/health.controller.ts
deleted file mode 100644
index c97d951..0000000
--- a/services/platform-api/src/health.controller.ts
+++ /dev/null
@@ -1,13 +0,0 @@
-import { Controller, Get } from '@nestjs/common'
-
-@Controller('health')
-export class HealthController {
- @Get()
- check() {
- return {
- status: 'ok',
- service: 'dezky-platform-api',
- timestamp: new Date().toISOString(),
- }
- }
-}
diff --git a/services/platform-api/src/health/health.controller.ts b/services/platform-api/src/health/health.controller.ts
new file mode 100644
index 0000000..cd98bb9
--- /dev/null
+++ b/services/platform-api/src/health/health.controller.ts
@@ -0,0 +1,28 @@
+import { Controller, Get, UseGuards } from '@nestjs/common'
+import { JwtAuthGuard } from '../auth/jwt-auth.guard.js'
+import { HealthService } from './health.service.js'
+
+@Controller('health')
+export class HealthController {
+  constructor(private readonly health: HealthService) {}
+
+  /**
+   * Liveness endpoint (GET /health). Deliberately unauthenticated so that
+   * infrastructure such as Docker or k8s can check the process is alive.
+   */
+  @Get()
+  check() {
+    const timestamp = new Date().toISOString()
+    return { status: 'ok', service: 'dezky-platform-api', timestamp }
+  }
+
+  /**
+   * Aggregated probe results for the neighbouring services
+   * (GET /health/platform). Guarded by JwtAuthGuard because the response
+   * reveals the topology of the deployment.
+   */
+  @Get('platform')
+  @UseGuards(JwtAuthGuard)
+  async platform() {
+    const results = await this.health.probeAll()
+    return results
+  }
+}
diff --git a/services/platform-api/src/health/health.module.ts b/services/platform-api/src/health/health.module.ts
new file mode 100644
index 0000000..2e1c19c
--- /dev/null
+++ b/services/platform-api/src/health/health.module.ts
@@ -0,0 +1,11 @@
+import { Module } from '@nestjs/common'
+import { AuthModule } from '../auth/auth.module.js'
+import { HealthController } from './health.controller.js'
+import { HealthService } from './health.service.js'
+
+// Health feature module: registers HealthController and provides HealthService.
+// NOTE(review): AuthModule is presumably imported so that the JwtAuthGuard used
+// by the controller can resolve its dependencies — confirm against AuthModule.
+@Module({
+ imports: [AuthModule],
+ controllers: [HealthController],
+ providers: [HealthService],
+})
+export class HealthModule {}
diff --git a/services/platform-api/src/health/health.service.ts b/services/platform-api/src/health/health.service.ts
new file mode 100644
index 0000000..d288f3c
--- /dev/null
+++ b/services/platform-api/src/health/health.service.ts
@@ -0,0 +1,151 @@
+// Live health probes for the services we expect to find in the Dezky stack.
+// Hostnames + ports are the compose service names from
+// infrastructure/docker-compose/docker-compose.yml. When we move to k3s,
+// swap these for in-cluster service DNS (e.g. authentik.dezky.svc...).
+
+import { Injectable } from '@nestjs/common'
+import { InjectConnection } from '@nestjs/mongoose'
+import type { Connection } from 'mongoose'
+import * as net from 'node:net'
+
+// Outcome of a single probe. 'ok' and 'warn' both mean the probe succeeded
+// ('warn' = it took longer than WARN_THRESHOLD_MS); 'bad' means it threw.
+export type ProbeStatus = 'ok' | 'warn' | 'bad'
+
+// Result of one probe as returned by /health/platform.
+export interface ProbeResult {
+ // id, name and role are copied verbatim from the probe's spec.
+ id: string
+ name: string
+ role: string
+ status: ProbeStatus
+ // Elapsed milliseconds around the probe call. null when a failed probe
+ // took at least the full probe timeout.
+ latencyMs: number | null
+ // Message of the error thrown by the probe; only set when status is 'bad'.
+ error?: string
+ // ISO-8601 timestamp taken once before the probes start, so every result
+ // of the same request carries the same value.
+ checkedAt: string
+}
+
+// Static description of one probe plus the function that performs it.
+interface ProbeSpec {
+  id: string
+  name: string
+  role: string
+  // Resolves (with no value) when the target is healthy and rejects with an
+  // Error describing the failure otherwise. The runner ignores any resolved
+  // value, so failures must be signalled by throwing. Latency is measured
+  // around the call by the runner; implementations only need to give up
+  // after `timeoutMs`.
+  run(timeoutMs: number): Promise<void>
+}
+
+// Generous-ish per-probe budget. Probes run in parallel, so the total
+// /health/platform response should be ~timeout regardless of count.
+const PROBE_TIMEOUT_MS = 1500
+// A probe that succeeds but takes longer than this many milliseconds is
+// reported as 'warn' instead of 'ok'.
+const WARN_THRESHOLD_MS = 500
+
+/**
+ * Runs one live probe per service in the stack and reports each outcome.
+ */
+@Injectable()
+export class HealthService {
+  constructor(@InjectConnection() private readonly mongo: Connection) {}
+
+  /**
+   * Probes every known service in parallel.
+   *
+   * @returns one ProbeResult per service, in the order the probes are listed
+   *   here. A failing probe produces a 'bad' entry, not a rejection.
+   */
+  async probeAll(): Promise<ProbeResult[]> {
+    // Every probe uses the budget the runner hands it rather than reading the
+    // module constant itself, so the timeout is decided in exactly one place.
+    const probes: ProbeSpec[] = [
+      { id: 'mail', name: 'Stalwart', role: 'Mail · IMAP/JMAP/SMTP', run: (timeoutMs) => tcpProbe('stalwart', 8080, timeoutMs) },
+      { id: 'files', name: 'OCIS', role: 'Files · OwnCloud Infinite', run: (timeoutMs) => httpProbe('http://ocis:9200/health', timeoutMs) },
+      { id: 'office', name: 'Collabora', role: 'Office editing · WOPI', run: (timeoutMs) => httpProbe('http://collabora:9980/hosting/discovery', timeoutMs) },
+      { id: 'auth', name: 'Authentik', role: 'Identity · SSO · MFA', run: (timeoutMs) => httpProbe('http://authentik-server:9000/-/health/ready/', timeoutMs) },
+      { id: 'pg', name: 'PostgreSQL', role: 'Authentik + OCIS database', run: (timeoutMs) => tcpProbe('postgres', 5432, timeoutMs) },
+      { id: 'mongo', name: 'MongoDB', role: 'Platform application data', run: (timeoutMs) => this.mongoPing(timeoutMs) },
+      { id: 'redis', name: 'Redis', role: 'Cache + session store', run: (timeoutMs) => tcpProbe('redis', 6379, timeoutMs) },
+      { id: 'proxy', name: 'Traefik', role: 'Reverse proxy · TLS', run: (timeoutMs) => tcpProbe('traefik', 80, timeoutMs) },
+      // platform-api itself: this code is running, so it's trivially ok.
+      { id: 'api', name: 'Platform API', role: 'Control plane', run: async () => { /* always ok */ } },
+    ]
+
+    // One timestamp for the whole batch so all rows agree on "when".
+    const checkedAt = new Date().toISOString()
+    return Promise.all(probes.map((p) => run(p, checkedAt)))
+  }
+
+  /**
+   * Pings MongoDB through the injected mongoose connection.
+   *
+   * @param timeoutMs how long to wait for the ping to answer
+   * @throws Error if the connection is not open, if the ping itself fails, or
+   *   if no answer arrives within `timeoutMs`
+   */
+  private async mongoPing(timeoutMs: number): Promise<void> {
+    // readyState 1 means "connected" in mongoose. `db` is only populated once
+    // the connection is open, so checking both replaces the `!` assertion
+    // with a real runtime guard.
+    const db = this.mongo.db
+    if (this.mongo.readyState !== 1 || !db) {
+      throw new Error(`mongoose readyState=${this.mongo.readyState}`)
+    }
+    await withTimeout(db.admin().ping(), timeoutMs, 'mongo ping timed out')
+  }
+}
+
+// ── Runner ─────────────────────────────────────────────────────────────────
+
+/**
+ * Executes one probe and converts its outcome into a ProbeResult.
+ *
+ * A probe that throws is reported as a 'bad' result carrying the error
+ * message, so callers can await many of these with Promise.all.
+ *
+ * @param spec      probe to execute
+ * @param checkedAt ISO timestamp stamped onto the result
+ * @returns the probe outcome, including the measured latency
+ */
+async function run(spec: ProbeSpec, checkedAt: string): Promise<ProbeResult> {
+  const { id, name, role } = spec
+  // performance.now() is monotonic: a wall-clock adjustment while the probe is
+  // in flight cannot yield a negative or inflated latency as Date.now() can.
+  const start = performance.now()
+  const elapsedMs = (): number => Math.round(performance.now() - start)
+  try {
+    await spec.run(PROBE_TIMEOUT_MS)
+    const latencyMs = elapsedMs()
+    const status: ProbeStatus = latencyMs > WARN_THRESHOLD_MS ? 'warn' : 'ok'
+    return { id, name, role, status, latencyMs, checkedAt }
+  } catch (err) {
+    const latencyMs = elapsedMs()
+    return {
+      id,
+      name,
+      role,
+      status: 'bad',
+      // A probe that used up the whole budget has no meaningful latency.
+      latencyMs: latencyMs < PROBE_TIMEOUT_MS ? latencyMs : null,
+      error: err instanceof Error ? err.message : String(err),
+      checkedAt,
+    }
+  }
+}
+
+// ── Primitives ─────────────────────────────────────────────────────────────
+
+/**
+ * Opens a TCP connection to host:port and closes it again as soon as the
+ * connection is established.
+ *
+ * @param host      hostname to connect to
+ * @param port      TCP port to connect to
+ * @param timeoutMs how long to wait for the connection before giving up
+ * @returns a promise that resolves once connected; it rejects with the socket
+ *   error, or with a timeout Error if no connection is made in `timeoutMs`
+ */
+function tcpProbe(host: string, port: number, timeoutMs: number): Promise<void> {
+  return new Promise<void>((resolve, reject) => {
+    const socket = net.createConnection({ host, port })
+    const timer = setTimeout(() => {
+      socket.destroy()
+      reject(new Error(`tcp ${host}:${port} timed out after ${timeoutMs}ms`))
+    }, timeoutMs)
+    socket.once('connect', () => {
+      clearTimeout(timer)
+      socket.end()
+      resolve()
+    })
+    // `on` rather than `once`: the listener stays attached for the socket's
+    // whole life, so an error emitted after the first one (or while closing)
+    // can never be an unhandled 'error' event. Rejecting twice is a no-op.
+    socket.on('error', (err) => {
+      clearTimeout(timer)
+      reject(err)
+    })
+  })
+}
+
+/**
+ * Issues a GET request and treats any 2xx response as healthy.
+ *
+ * @param url       absolute URL to request
+ * @param timeoutMs how long to wait before aborting the request
+ * @throws Error on a non-2xx status, on a network failure, or — with a
+ *   "timed out" message — when the request is aborted after `timeoutMs`
+ */
+async function httpProbe(url: string, timeoutMs: number): Promise<void> {
+  const controller = new AbortController()
+  const timer = setTimeout(() => controller.abort(), timeoutMs)
+  try {
+    const res = await fetch(url, { signal: controller.signal, method: 'GET' })
+    // Only the status matters. Cancel the unread body so the underlying
+    // connection is released now instead of whenever the response object is
+    // garbage-collected (some endpoints return sizeable documents).
+    await res.body?.cancel()
+    if (!res.ok) {
+      throw new Error(`HTTP ${res.status} from ${url}`)
+    }
+  } catch (err) {
+    // An abort surfaces as a generic "operation was aborted" error; replace it
+    // with a message that names the target, matching the TCP probe's wording.
+    if (controller.signal.aborted) {
+      throw new Error(`http ${url} timed out after ${timeoutMs}ms`)
+    }
+    throw err
+  } finally {
+    clearTimeout(timer)
+  }
+}
+
+/**
+ * Settles like `p`, unless `p` is still pending after `timeoutMs`, in which
+ * case the returned promise rejects with `new Error(msg)`.
+ *
+ * The underlying operation is not cancelled on timeout; only the wait ends.
+ *
+ * @param p         promise to wait for
+ * @param timeoutMs maximum time to wait, in milliseconds
+ * @param msg       message of the Error used for the timeout rejection
+ * @returns a promise with the value or rejection of `p`, or the timeout error
+ */
+function withTimeout<T>(p: Promise<T>, timeoutMs: number, msg: string): Promise<T> {
+  let timer: ReturnType<typeof setTimeout> | undefined
+  const deadline = new Promise<never>((_resolve, reject) => {
+    timer = setTimeout(() => reject(new Error(msg)), timeoutMs)
+  })
+  // Whichever settles first wins; the timer is cleared either way so it never
+  // keeps the event loop alive after `p` has settled.
+  return Promise.race([p, deadline]).finally(() => clearTimeout(timer))
+}