feat(operator): live Infrastructure probes + honest split between deployed and planned
The Infrastructure page used to read from a mock fixture that lied two ways:
it listed services that aren't deployed (Jitsi, Zulip, Cloudflare, Object
Storage, Postmark) and showed hardcoded uptime/latency for the ones that
are. Now it shows truth from real probes plus a clearly-labelled "planned"
section for the rest.
Backend (services/platform-api):
- New src/health/ module — HealthService runs 9 probes in parallel with a
1.5s timeout each:
Stalwart → TCP stalwart:8080
OCIS → HTTP GET ocis:9200/health
Collabora → HTTP GET collabora:9980/hosting/discovery
Authentik → HTTP GET authentik-server:9000/-/health/ready/
Postgres → TCP postgres:5432
Mongo → existing Mongoose connection.db.admin().ping()
Redis → TCP redis:6379
Traefik → TCP traefik:80
Platform API → trivially ok (this code is running)
Status thresholds: ok ≤500ms, warn 500–1500ms, bad on timeout/refuse.
- HealthController exposes GET /health/platform behind JwtAuthGuard, plus
keeps the existing public GET /health for infra liveness checks.
- Moved the old src/health.controller.ts into the new module.
Frontend (apps/operator):
- /api/health/platform proxy forwards the operator's access token.
- Infrastructure page swaps SERVICES fixture for useFetch with 30s auto-
refresh + a manual Refresh button. Cards show real status badge + real
latency; uptime/error stay as em-dash with a "no probe history yet"
tooltip until a Prometheus/event-log backend lands.
- Below the live grid, a "Planned · not deployed" section renders 5 dimmed
cards (Jitsi, Zulip, simpledns.plus, Hetzner Object Storage, Postmark).
simpledns.plus replaces the misnamed Cloudflare entry — we use
simpledns.plus, not Cloudflare.
- Subtitle is now truthful: "8 / 9 services live · checked 2s ago".
Verified: stopped redis → card flipped to "down · getaddrinfo ENOTFOUND
redis", subtitle reflected 8/9, incident banner appeared. Restarted →
back to 9/9, banner gone.
SERVICES fixture stays in place for Overview's incident banner — replacing
that is a separate follow-up tied to the incident-management backend.
This commit is contained in:
@@ -2,7 +2,7 @@ import { Module } from '@nestjs/common'
|
||||
import { ConfigModule } from '@nestjs/config'
|
||||
import { MongooseModule } from '@nestjs/mongoose'
|
||||
import { AuthModule } from './auth/auth.module.js'
|
||||
import { HealthController } from './health.controller.js'
|
||||
import { HealthModule } from './health/health.module.js'
|
||||
import { PartnersModule } from './partners/partners.module.js'
|
||||
import { SeedModule } from './seed/seed.module.js'
|
||||
import { SubscriptionsModule } from './subscriptions/subscriptions.module.js'
|
||||
@@ -16,12 +16,12 @@ import { UsersModule } from './users/users.module.js'
|
||||
process.env.MONGODB_URI ?? 'mongodb://localhost:27017/dezky',
|
||||
),
|
||||
AuthModule,
|
||||
HealthModule,
|
||||
TenantsModule,
|
||||
PartnersModule,
|
||||
UsersModule,
|
||||
SubscriptionsModule,
|
||||
SeedModule,
|
||||
],
|
||||
controllers: [HealthController],
|
||||
})
|
||||
export class AppModule {}
|
||||
|
||||
@@ -1,13 +0,0 @@
|
||||
import { Controller, Get } from '@nestjs/common'
|
||||
|
||||
/**
 * Minimal liveness controller mounted at `/health`.
 *
 * It has no dependencies and performs no checks of its own: answering at all
 * is the signal that the process is up.
 */
@Controller('health')
export class HealthController {
  /** `GET /health` — static "ok" payload stamped with the current time. */
  @Get()
  check() {
    const timestamp = new Date().toISOString()
    return { status: 'ok', service: 'dezky-platform-api', timestamp }
  }
}
|
||||
@@ -0,0 +1,28 @@
|
||||
import { Controller, Get, UseGuards } from '@nestjs/common'
|
||||
import { JwtAuthGuard } from '../auth/jwt-auth.guard.js'
|
||||
import { HealthService } from './health.service.js'
|
||||
|
||||
/**
 * HTTP surface for health information, mounted at `/health`.
 *
 * Two routes with deliberately different exposure:
 *  - `GET /health`          — unauthenticated liveness ping.
 *  - `GET /health/platform` — authenticated, aggregated probe results.
 */
@Controller('health')
export class HealthController {
  constructor(private readonly health: HealthService) {}

  /**
   * Public liveness probe. Infra tooling (Docker / k8s) calls this to learn
   * whether the platform-api process is alive, so it must stay outside auth.
   */
  @Get()
  check() {
    const timestamp = new Date().toISOString()
    return { status: 'ok', service: 'dezky-platform-api', timestamp }
  }

  /**
   * Probe results for every neighbouring service in the stack.
   *
   * Guarded by JwtAuthGuard: the payload reveals the deployment topology and
   * should not be readable by unauthenticated clients.
   */
  @Get('platform')
  @UseGuards(JwtAuthGuard)
  async platform() {
    const results = await this.health.probeAll()
    return results
  }
}
|
||||
@@ -0,0 +1,11 @@
|
||||
import { Module } from '@nestjs/common'
|
||||
import { AuthModule } from '../auth/auth.module.js'
|
||||
import { HealthController } from './health.controller.js'
|
||||
import { HealthService } from './health.service.js'
|
||||
|
||||
/**
 * Nest module that wires the health endpoints to the probe service.
 *
 * NOTE(review): AuthModule is imported presumably so the JwtAuthGuard used by
 * the controller can resolve its dependencies — confirm against AuthModule.
 */
@Module({
  controllers: [HealthController],
  providers: [HealthService],
  imports: [AuthModule],
})
export class HealthModule {}
|
||||
@@ -0,0 +1,151 @@
|
||||
// Live health probes for the services we expect to find in the Dezky stack.
|
||||
// Hostnames + ports are the compose service names from
|
||||
// infrastructure/docker-compose/docker-compose.yml. When we move to k3s,
|
||||
// swap these for in-cluster service DNS (e.g. authentik.dezky.svc...).
|
||||
|
||||
import { Injectable } from '@nestjs/common'
|
||||
import { InjectConnection } from '@nestjs/mongoose'
|
||||
import type { Connection } from 'mongoose'
|
||||
import * as net from 'node:net'
|
||||
|
||||
/** Outcome bucket of a single probe: healthy, slow-but-reachable, or failed. */
export type ProbeStatus =
  | 'ok'
  | 'warn'
  | 'bad'

/** One entry of the `/health/platform` payload: the result of one probe. */
export interface ProbeResult {
  /** Short stable identifier of the probed service (copied from the spec). */
  id: string
  /** Display name of the probed service (copied from the spec). */
  name: string
  /** Short description of what the service does in the stack. */
  role: string
  /** `bad` when the probe threw; otherwise `warn` or `ok` depending on latency. */
  status: ProbeStatus
  /**
   * Milliseconds the probe took. `null` when a failed probe used up the whole
   * timeout budget, since the number then says nothing about the service.
   */
  latencyMs: number | null
  /** Failure reason; only set when the probe threw. */
  error?: string
  /** ISO-8601 timestamp of the probe round this result belongs to. */
  checkedAt: string
}
|
||||
|
||||
/** Static description of one probe plus the function that performs it. */
interface ProbeSpec {
  /** Short stable identifier, echoed back in the result. */
  id: string
  /** Display name, echoed back in the result. */
  name: string
  /** Short description of the service's role, echoed back in the result. */
  role: string
  /**
   * Performs the check. Resolving means the service is reachable; rejecting
   * (or throwing) means it is not, and the error message becomes the failure
   * reason. The probe does not measure latency itself — the runner times the
   * call from the outside.
   */
  run(timeoutMs: number): Promise<void>
}
|
||||
|
||||
// Upper bound, in milliseconds, that a single probe may take. All probes are
// started together, so a whole /health/platform round should take roughly
// this long in the worst case no matter how many probes there are.
const PROBE_TIMEOUT_MS = 1500

// A probe that succeeds but takes longer than this many milliseconds is
// reported as 'warn' instead of 'ok'.
const WARN_THRESHOLD_MS = 500
|
||||
|
||||
/**
 * Runs live reachability probes against the services the platform depends on
 * and reports one result per service.
 */
@Injectable()
export class HealthService {
  constructor(@InjectConnection() private readonly mongo: Connection) {}

  /**
   * Starts every probe at once and waits for all of them.
   *
   * Each probe receives its time budget from the runner and is expected to
   * give up on its own once that budget is spent. The runner converts probe
   * failures into `bad` results, so this method resolves with one entry per
   * probe, in the order listed below.
   *
   * @returns One result per probed service, all sharing one `checkedAt`.
   */
  async probeAll(): Promise<ProbeResult[]> {
    // Every closure uses the budget handed in by the runner rather than
    // reaching for the module constant, so there is one source of truth.
    const probes: ProbeSpec[] = [
      { id: 'mail', name: 'Stalwart', role: 'Mail · IMAP/JMAP/SMTP', run: (timeoutMs) => tcpProbe('stalwart', 8080, timeoutMs) },
      { id: 'files', name: 'OCIS', role: 'Files · OwnCloud Infinite', run: (timeoutMs) => httpProbe('http://ocis:9200/health', timeoutMs) },
      { id: 'office', name: 'Collabora', role: 'Office editing · WOPI', run: (timeoutMs) => httpProbe('http://collabora:9980/hosting/discovery', timeoutMs) },
      { id: 'auth', name: 'Authentik', role: 'Identity · SSO · MFA', run: (timeoutMs) => httpProbe('http://authentik-server:9000/-/health/ready/', timeoutMs) },
      { id: 'pg', name: 'PostgreSQL', role: 'Authentik + OCIS database', run: (timeoutMs) => tcpProbe('postgres', 5432, timeoutMs) },
      { id: 'mongo', name: 'MongoDB', role: 'Platform application data', run: (timeoutMs) => this.mongoPing(timeoutMs) },
      { id: 'redis', name: 'Redis', role: 'Cache + session store', run: (timeoutMs) => tcpProbe('redis', 6379, timeoutMs) },
      { id: 'proxy', name: 'Traefik', role: 'Reverse proxy · TLS', run: (timeoutMs) => tcpProbe('traefik', 80, timeoutMs) },
      // platform-api itself: this code is running, so it's trivially ok.
      { id: 'api', name: 'Platform API', role: 'Control plane', run: async () => { /* always ok */ } },
    ]

    // Taken once so every card in a round shows the same "checked at" time.
    const checkedAt = new Date().toISOString()
    return Promise.all(probes.map((p) => run(p, checkedAt)))
  }

  /**
   * Pings MongoDB over the already-open Mongoose connection.
   *
   * @param timeoutMs Maximum time to wait for the ping to answer.
   * @throws Error when the connection is not open, has no db handle, the ping
   *   fails, or the ping does not answer within `timeoutMs`.
   */
  private async mongoPing(timeoutMs: number): Promise<void> {
    // readyState 1 is the only state in which a ping is attempted.
    if (this.mongo.readyState !== 1) {
      throw new Error(`mongoose readyState=${this.mongo.readyState}`)
    }
    // Guard instead of a non-null assertion: fail as a normal probe error if
    // the handle is unexpectedly missing.
    const db = this.mongo.db
    if (!db) {
      throw new Error('mongoose connection has no db handle')
    }
    await withTimeout(db.admin().ping(), timeoutMs, 'mongo ping timed out')
  }
}
|
||||
|
||||
// ── Runner ─────────────────────────────────────────────────────────────────
|
||||
|
||||
/**
 * Executes one probe, times it, and folds the outcome into a ProbeResult.
 *
 * Never rejects because of a failing probe: anything the probe throws is
 * reported as a `bad` result carrying the error message.
 *
 * @param spec      Probe to execute.
 * @param checkedAt Timestamp to stamp on the result.
 * @returns `ok` / `warn` (by latency) on success, `bad` on failure.
 */
async function run(spec: ProbeSpec, checkedAt: string): Promise<ProbeResult> {
  const { id, name, role } = spec
  const startedAt = Date.now()
  const elapsed = (): number => Date.now() - startedAt

  try {
    await spec.run(PROBE_TIMEOUT_MS)
  } catch (err) {
    const took = elapsed()
    return {
      id,
      name,
      role,
      status: 'bad',
      // A failure that used the whole budget is a timeout; its duration says
      // nothing about the service, so report no latency at all.
      latencyMs: took < PROBE_TIMEOUT_MS ? took : null,
      error: err instanceof Error ? err.message : String(err),
      checkedAt,
    }
  }

  const took = elapsed()
  return {
    id,
    name,
    role,
    status: took > WARN_THRESHOLD_MS ? 'warn' : 'ok',
    latencyMs: took,
    checkedAt,
  }
}
|
||||
|
||||
// ── Primitives ─────────────────────────────────────────────────────────────
|
||||
|
||||
/**
 * Checks that a TCP connection to `host:port` can be established.
 *
 * No bytes are exchanged: the probe succeeds as soon as the handshake
 * completes, then drops the connection.
 *
 * @param host      Hostname or IP address to connect to.
 * @param port      TCP port to connect to.
 * @param timeoutMs Maximum time to wait for the connection.
 * @returns Resolves once connected; rejects with the socket error (refused,
 *   DNS failure, ...) or with a "timed out" error once the budget is spent.
 */
function tcpProbe(host: string, port: number, timeoutMs: number): Promise<void> {
  return new Promise((resolve, reject) => {
    const socket = net.createConnection({ host, port })

    const timer = setTimeout(() => {
      settle(new Error(`tcp ${host}:${port} timed out after ${timeoutMs}ms`))
    }, timeoutMs)

    // Single exit point. Settling a promise twice is a no-op, so a late event
    // after the first outcome is harmless.
    const settle = (failure?: Error): void => {
      clearTimeout(timer)
      // destroy() rather than end(): end() only half-closes and leaves the
      // socket open until the peer closes its side. A probe has nothing more
      // to say, so release the descriptor immediately on every path.
      socket.destroy()
      if (failure) {
        reject(failure)
      } else {
        resolve()
      }
    }

    socket.once('connect', () => settle())
    // on(), not once(): keep a listener attached for the socket's whole life
    // so an error arriving after the outcome cannot become an unhandled
    // 'error' event.
    socket.on('error', (err) => settle(err))
  })
}
|
||||
|
||||
/**
 * Checks that `url` answers a GET with a 2xx status.
 *
 * Only the status line matters; the body is discarded unread.
 *
 * @param url       Absolute URL to request.
 * @param timeoutMs Maximum time to wait for the response headers.
 * @returns Resolves on a 2xx response.
 * @throws Error `HTTP <status> from <url>` on a non-2xx response,
 *   `http <url> timed out after <n>ms` when the budget is spent, or the
 *   underlying network error (connection refused, DNS failure, ...).
 */
async function httpProbe(url: string, timeoutMs: number): Promise<void> {
  const controller = new AbortController()
  const timer = setTimeout(() => controller.abort(), timeoutMs)
  try {
    const res = await fetch(url, { signal: controller.signal, method: 'GET' })
    // Cancel the unread body so the underlying connection is released right
    // away instead of whenever the response object gets garbage-collected.
    await res.body?.cancel().catch(() => undefined)
    if (!res.ok) {
      throw new Error(`HTTP ${res.status} from ${url}`)
    }
  } catch (err) {
    // An abort can only come from our own timer; say so explicitly instead
    // of surfacing a generic "operation was aborted" message.
    if (controller.signal.aborted) {
      throw new Error(`http ${url} timed out after ${timeoutMs}ms`)
    }
    // fetch reports network failures as a bare "fetch failed" with the real
    // reason attached as `cause`; surface that reason when it has a message.
    const cause = err instanceof Error ? (err as Error & { cause?: unknown }).cause : undefined
    if (cause instanceof Error && cause.message !== '') {
      throw cause
    }
    throw err
  } finally {
    clearTimeout(timer)
  }
}
|
||||
|
||||
/**
 * Bounds how long the caller waits for `p`.
 *
 * The underlying operation is not cancelled; only the wait is cut short.
 *
 * @param p         Promise to wait for.
 * @param timeoutMs Maximum time to wait, in milliseconds.
 * @param msg       Message of the Error used when the time runs out.
 * @returns A promise that mirrors `p` if it settles in time, and otherwise
 *   rejects with `new Error(msg)`.
 */
function withTimeout<T>(p: Promise<T>, timeoutMs: number, msg: string): Promise<T> {
  let timer: ReturnType<typeof setTimeout> | undefined
  const deadline = new Promise<never>((_resolve, reject) => {
    timer = setTimeout(() => reject(new Error(msg)), timeoutMs)
  })
  // Whichever settles first wins; the timer is disarmed either way so a
  // finished wait leaves nothing pending.
  return Promise.race([p, deadline]).finally(() => clearTimeout(timer))
}
|
||||
Reference in New Issue
Block a user