diff --git a/apps/operator/data/fixtures.ts b/apps/operator/data/fixtures.ts
index 96c77ec..18baace 100644
--- a/apps/operator/data/fixtures.ts
+++ b/apps/operator/data/fixtures.ts
@@ -108,6 +108,24 @@ export const OP_AUDIT: AuditEntry[] = [
   { id: 'op_8811', when: '09:30:00', actor: 'Anne Baslund', role: 'platform admin', action: 'tos.published', target: 'v2026.05 · all tenants', tenant: '—', ip: '10.0.4.18', tone: 'info' },
 ]
 
+// Services in the design that haven't been deployed yet. Surfaced as a
+// separate "Planned" section on the Infrastructure page so the operator sees
+// honest deployment state instead of a fake all-green grid.
+export interface PlannedService {
+  id: string // short slug, e.g. 'jitsi' (presumably the list key — confirm in the page)
+  name: string // display name rendered on the planned-service card
+  role: string // what the service is for, e.g. 'Video meetings'
+  note: string // deployment note: when it lands, or that it is external / prod only
+}
+
+export const PLANNED_SERVICES: PlannedService[] = [
+  { id: 'jitsi', name: 'Jitsi', role: 'Video meetings', note: 'Lands with docker-compose.optional.yml (Phase 7)' },
+  { id: 'zulip', name: 'Zulip', role: 'Team chat', note: 'Lands with docker-compose.optional.yml (Phase 7)' },
+  { id: 'dns', name: 'simpledns.plus', role: 'DNS · authoritative', note: 'External SaaS · prod only' },
+  { id: 'objstore', name: 'Hetzner Object Storage', role: 'Files · S3 backend for OCIS', note: 'External · prod only' },
+  { id: 'smtp-out', name: 'Postmark', role: 'Outbound SMTP · transactional email', note: 'External SaaS · prod only' },
+]
+
 export type NotificationKind = 'security' | 'user' | 'billing' | 'integration' | 'support' | 'signin'
 export type NotificationTone = 'warn' | 'info' | 'neutral' | 'ok' | 'bad'
 export interface NotificationItem {
diff --git a/apps/operator/pages/infrastructure.vue b/apps/operator/pages/infrastructure.vue
index 62af2df..ee49e18 100644
--- a/apps/operator/pages/infrastructure.vue
+++ b/apps/operator/pages/infrastructure.vue
@@ -1,15 +1,55 @@
@@ -18,17 +58,13 @@ function label(s: PlatformService) {
@@ -40,33 +76,63 @@ function label(s: PlatformService) {
{{ INCIDENT.title }}
-
Started {{ INCIDENT.started }} · IC: {{ INCIDENT.ic }}
+
{{ degradedCount }} service(s) reporting non-ok status · IC: {{ INCIDENT.ic }}
- Open incident + Open incident + Live · {{ totalCount }} services +
- + +
+
+
{{ p.name }}
+ {{ p.role }} +
+ {{ label(p) }} +
+
+ + + +
+
+ probed {{ checkedAgo }} + + {{ p.status === 'bad' ? 'down' : 'slow' }} · {{ p.error.slice(0, 32) }} + + {{ p.status === 'ok' ? 'ok' : 'check details' }} +
+
+
+ + Planned · {{ PLANNED_SERVICES.length }} services · not deployed + +
+
{{ s.name }}
{{ s.role }}
- {{ label(s) }} + not deployed
-
- - - -
-
- last incident · {{ s.last }} - details → +
+ {{ s.note }}
- // mock fixtures — wire up to Docker healthchecks + Prometheus in a follow-up + + // probes live in services/platform-api/src/health/. uptime / error rate stay + em-dashed until a probe history (Prometheus, persisted event log) lands — + see "Real observability" in NEXT-STEPS.md follow-ups +
@@ -74,6 +140,8 @@ function label(s: PlatformService) {
diff --git a/apps/operator/server/api/health/platform.get.ts b/apps/operator/server/api/health/platform.get.ts
new file mode 100644
index 0000000..9691e1b
--- /dev/null
+++ b/apps/operator/server/api/health/platform.get.ts
@@ -0,0 +1,3 @@
+import { platformApi } from '~~/server/utils/platform-api'
+
+export default defineEventHandler((event) => platformApi(event, '/health/platform'))
diff --git a/services/platform-api/src/app.module.ts b/services/platform-api/src/app.module.ts
index 63170b0..e159b8e 100644
--- a/services/platform-api/src/app.module.ts
+++ b/services/platform-api/src/app.module.ts
@@ -2,7 +2,7 @@ import { Module } from '@nestjs/common'
 import { ConfigModule } from '@nestjs/config'
 import { MongooseModule } from '@nestjs/mongoose'
 import { AuthModule } from './auth/auth.module.js'
-import { HealthController } from './health.controller.js'
+import { HealthModule } from './health/health.module.js'
 import { PartnersModule } from './partners/partners.module.js'
 import { SeedModule } from './seed/seed.module.js'
 import { SubscriptionsModule } from './subscriptions/subscriptions.module.js'
@@ -16,12 +16,12 @@ import { UsersModule } from './users/users.module.js'
       process.env.MONGODB_URI ?? 'mongodb://localhost:27017/dezky',
     ),
     AuthModule,
+    HealthModule,
     TenantsModule,
     PartnersModule,
     UsersModule,
     SubscriptionsModule,
     SeedModule,
   ],
-  controllers: [HealthController],
 })
 export class AppModule {}
diff --git a/services/platform-api/src/health.controller.ts b/services/platform-api/src/health.controller.ts
deleted file mode 100644
index c97d951..0000000
--- a/services/platform-api/src/health.controller.ts
+++ /dev/null
@@ -1,13 +0,0 @@
-import { Controller, Get } from '@nestjs/common'
-
-@Controller('health')
-export class HealthController {
-  @Get()
-  check() {
-    return {
-      status: 'ok',
-      service: 'dezky-platform-api',
-      timestamp: new Date().toISOString(),
-    }
-  }
-}
diff --git a/services/platform-api/src/health/health.controller.ts b/services/platform-api/src/health/health.controller.ts
new file mode 100644
index 0000000..cd98bb9
--- /dev/null
+++ b/services/platform-api/src/health/health.controller.ts
@@ -0,0 +1,28 @@
+import { Controller, Get, UseGuards } from '@nestjs/common'
+import { JwtAuthGuard } from '../auth/jwt-auth.guard.js'
+import { HealthService } from './health.service.js'
+
+@Controller('health')
+export class HealthController {
+  constructor(private readonly health: HealthService) {}
+
+  // Public liveness probe — used by infra (Docker / k8s) to know the
+  // platform-api process is alive. Intentionally not behind auth.
+  @Get()
+  check() {
+    return {
+      status: 'ok',
+      service: 'dezky-platform-api',
+      timestamp: new Date().toISOString(),
+    }
+  }
+
+  // Aggregated probes of every neighbouring service in the stack. Behind
+  // JwtAuthGuard because we don't want this read by random unauthenticated
+  // clients — it leaks the topology of the deployment.
+  @Get('platform')
+  @UseGuards(JwtAuthGuard)
+  async platform() {
+    return this.health.probeAll()
+  }
+}
diff --git a/services/platform-api/src/health/health.module.ts b/services/platform-api/src/health/health.module.ts
new file mode 100644
index 0000000..2e1c19c
--- /dev/null
+++ b/services/platform-api/src/health/health.module.ts
@@ -0,0 +1,11 @@
+import { Module } from '@nestjs/common'
+import { AuthModule } from '../auth/auth.module.js'
+import { HealthController } from './health.controller.js'
+import { HealthService } from './health.service.js'
+
+@Module({
+  imports: [AuthModule],
+  controllers: [HealthController],
+  providers: [HealthService],
+})
+export class HealthModule {}
diff --git a/services/platform-api/src/health/health.service.ts b/services/platform-api/src/health/health.service.ts
new file mode 100644
index 0000000..d288f3c
--- /dev/null
+++ b/services/platform-api/src/health/health.service.ts
@@ -0,0 +1,195 @@
+// Live health probes for the services we expect to find in the Dezky stack.
+// Hostnames + ports are the compose service names from
+// infrastructure/docker-compose/docker-compose.yml. When we move to k3s,
+// swap these for in-cluster service DNS (e.g. authentik.dezky.svc...).
+
+import { Injectable } from '@nestjs/common'
+import { InjectConnection } from '@nestjs/mongoose'
+import type { Connection } from 'mongoose'
+import * as net from 'node:net'
+
+/** Traffic-light state of one probe: reachable, reachable but slow, or failed. */
+export type ProbeStatus = 'ok' | 'warn' | 'bad'
+
+/** Outcome of probing a single service, as returned by /health/platform. */
+export interface ProbeResult {
+  id: string
+  name: string
+  role: string
+  status: ProbeStatus
+  // Wall-clock duration of the probe; null when a failed probe used up the
+  // whole timeout budget, i.e. there is no meaningful latency to report.
+  latencyMs: number | null
+  // Failure reason; only set when status is 'bad'.
+  error?: string
+  // ISO-8601 timestamp taken once per probeAll() run and shared by every result.
+  checkedAt: string
+}
+
+interface ProbeSpec {
+  id: string
+  name: string
+  role: string
+  // Resolves on success and rejects with the failure reason. Latency is
+  // measured around the call by the runner.
+  run(timeoutMs: number): Promise<void>
+}
+
+// Generous-ish per-probe budget. Probes run in parallel, so the total
+// /health/platform response should be ~timeout regardless of count.
+const PROBE_TIMEOUT_MS = 1500
+// Successful probes slower than this are reported as 'warn' instead of 'ok'.
+const WARN_THRESHOLD_MS = 500
+
+@Injectable()
+export class HealthService {
+  constructor(@InjectConnection() private readonly mongo: Connection) {}
+
+  /**
+   * Probes every service in the stack in parallel and returns one result per
+   * service, in declaration order. Never rejects because of a failing probe:
+   * failures are reported as status 'bad' on the individual result.
+   */
+  async probeAll(): Promise<ProbeResult[]> {
+    // Each probe honours the budget handed to it by the runner instead of
+    // reading PROBE_TIMEOUT_MS itself, so the budget is defined in one place.
+    const probes: ProbeSpec[] = [
+      { id: 'mail', name: 'Stalwart', role: 'Mail · IMAP/JMAP/SMTP', run: (t) => tcpProbe('stalwart', 8080, t) },
+      { id: 'files', name: 'OCIS', role: 'Files · OwnCloud Infinite', run: (t) => httpProbe('http://ocis:9200/health', t) },
+      { id: 'office', name: 'Collabora', role: 'Office editing · WOPI', run: (t) => httpProbe('http://collabora:9980/hosting/discovery', t) },
+      { id: 'auth', name: 'Authentik', role: 'Identity · SSO · MFA', run: (t) => httpProbe('http://authentik-server:9000/-/health/ready/', t) },
+      { id: 'pg', name: 'PostgreSQL', role: 'Authentik + OCIS database', run: (t) => tcpProbe('postgres', 5432, t) },
+      { id: 'mongo', name: 'MongoDB', role: 'Platform application data', run: (t) => this.mongoPing(t) },
+      { id: 'redis', name: 'Redis', role: 'Cache + session store', run: (t) => tcpProbe('redis', 6379, t) },
+      { id: 'proxy', name: 'Traefik', role: 'Reverse proxy · TLS', run: (t) => tcpProbe('traefik', 80, t) },
+      // platform-api itself: this code is running, so it's trivially ok.
+      { id: 'api', name: 'Platform API', role: 'Control plane', run: async () => { /* always ok */ } },
+    ]
+
+    const checkedAt = new Date().toISOString()
+    return Promise.all(probes.map((p) => run(p, checkedAt)))
+  }
+
+  /** Pings MongoDB over the injected mongoose connection; rejects if it is not open. */
+  private async mongoPing(timeoutMs: number): Promise<void> {
+    if (this.mongo.readyState !== 1) {
+      throw new Error(`mongoose readyState=${this.mongo.readyState}`)
+    }
+    // db should be defined once the connection is open, but guard explicitly
+    // instead of asserting, so a surprise shows up as a 'bad' probe with a
+    // readable reason rather than a TypeError.
+    const db = this.mongo.db
+    if (!db) {
+      throw new Error('mongoose connection has no db handle')
+    }
+    await withTimeout(db.admin().ping(), timeoutMs, 'mongo ping timed out')
+  }
+}
+
+// ── Runner ─────────────────────────────────────────────────────────────────
+
+/**
+ * Runs one probe and converts its outcome into a ProbeResult. Never rejects:
+ * anything the probe throws becomes status 'bad' with the message in `error`.
+ */
+async function run(spec: ProbeSpec, checkedAt: string): Promise<ProbeResult> {
+  const start = Date.now()
+  try {
+    await spec.run(PROBE_TIMEOUT_MS)
+    const latencyMs = Date.now() - start
+    return {
+      id: spec.id,
+      name: spec.name,
+      role: spec.role,
+      status: latencyMs > WARN_THRESHOLD_MS ? 'warn' : 'ok',
+      latencyMs,
+      checkedAt,
+    }
+  } catch (err) {
+    const latencyMs = Date.now() - start
+    return {
+      id: spec.id,
+      name: spec.name,
+      role: spec.role,
+      status: 'bad',
+      // A probe that burned the whole budget timed out; its duration is the
+      // timeout, not a latency, so report null.
+      latencyMs: latencyMs < PROBE_TIMEOUT_MS ? latencyMs : null,
+      error: err instanceof Error ? err.message : String(err),
+      checkedAt,
+    }
+  }
+}
+
+// ── Primitives ─────────────────────────────────────────────────────────────
+
+/**
+ * Resolves once a TCP connection to host:port is established, then closes it.
+ * Rejects with the socket error, or with a timeout error after timeoutMs.
+ */
+function tcpProbe(host: string, port: number, timeoutMs: number): Promise<void> {
+  return new Promise<void>((resolve, reject) => {
+    const socket = net.createConnection({ host, port })
+    const timer = setTimeout(() => {
+      socket.destroy()
+      reject(new Error(`tcp ${host}:${port} timed out after ${timeoutMs}ms`))
+    }, timeoutMs)
+    socket.once('connect', () => {
+      clearTimeout(timer)
+      socket.end()
+      resolve()
+    })
+    socket.once('error', (err) => {
+      clearTimeout(timer)
+      reject(err)
+    })
+  })
+}
+
+/**
+ * Resolves when GET url answers with a 2xx status within timeoutMs. Rejects
+ * on a non-2xx status, a network error, or a timeout.
+ */
+async function httpProbe(url: string, timeoutMs: number): Promise<void> {
+  const controller = new AbortController()
+  const timer = setTimeout(() => controller.abort(), timeoutMs)
+  try {
+    const res = await fetch(url, { signal: controller.signal, method: 'GET' })
+    // Only the status matters. Cancel the unread body so the connection is
+    // released right away instead of lingering until garbage collection;
+    // best-effort, a failed cancel must not fail an otherwise healthy probe.
+    void res.body?.cancel().catch(() => undefined)
+    if (!res.ok) {
+      throw new Error(`HTTP ${res.status} from ${url}`)
+    }
+  } catch (err) {
+    // fetch rejects an aborted request with an error that does not mention
+    // the timeout; report it the way tcpProbe reports its own timeouts.
+    if (controller.signal.aborted) {
+      throw new Error(`http ${url} timed out after ${timeoutMs}ms`)
+    }
+    throw err
+  } finally {
+    clearTimeout(timer)
+  }
+}
+
+/**
+ * Races p against a timer: settles like p if p settles within timeoutMs,
+ * otherwise rejects with Error(msg). p itself is not cancelled.
+ */
+function withTimeout<T>(p: Promise<T>, timeoutMs: number, msg: string): Promise<T> {
+  return new Promise<T>((resolve, reject) => {
+    const timer = setTimeout(() => reject(new Error(msg)), timeoutMs)
+    p.then(
+      (v) => {
+        clearTimeout(timer)
+        resolve(v)
+      },
+      (e) => {
+        clearTimeout(timer)
+        reject(e)
+      },
+    )
+  })
+}