feat(operator): live Infrastructure probes + honest split between deployed and planned

The Infrastructure page used to read from a mock fixture that lied two ways:
it listed services that aren't deployed (Jitsi, Zulip, Cloudflare, Object
Storage, Postmark) and showed hardcoded uptime/latency for the ones that
are. Now it shows truth from real probes plus a clearly-labelled "planned"
section for the rest.

Backend (services/platform-api):
- New src/health/ module — HealthService runs 9 probes in parallel with a
  1.5s timeout each:
    Stalwart  → TCP stalwart:8080
    OCIS      → HTTP GET ocis:9200/health
    Collabora → HTTP GET collabora:9980/hosting/discovery
    Authentik → HTTP GET authentik-server:9000/-/health/ready/
    Postgres  → TCP postgres:5432
    Mongo     → existing Mongoose connection.db.admin().ping()
    Redis     → TCP redis:6379
    Traefik   → TCP traefik:80
    Platform API → trivially ok (this code is running)
  Status thresholds: ok ≤500ms, warn 500–1500ms, bad on timeout or connection refused.
- HealthController exposes GET /health/platform behind JwtAuthGuard, plus
  keeps the existing public GET /health for infra liveness checks.
- Moved the old src/health.controller.ts into the new module.

Frontend (apps/operator):
- /api/health/platform proxy forwards the operator's access token.
- Infrastructure page swaps SERVICES fixture for useFetch with 30s auto-
  refresh + a manual Refresh button. Cards show real status badge + real
  latency; uptime/error stay as em-dash with a "no probe history yet"
  tooltip until a Prometheus/event-log backend lands.
- Below the live grid, a "Planned · not deployed" section renders 5 dimmed
  cards (Jitsi, Zulip, simpledns.plus, Hetzner Object Storage, Postmark).
  simpledns.plus replaces the misnamed Cloudflare entry — we use
  simpledns.plus, not Cloudflare.
- Subtitle is now computed from the probe results, e.g. "8 / 9 services
  live · checked 2s ago".

Verified: stopped redis → card flipped to "down · getaddrinfo ENOTFOUND
redis", subtitle reflected 8/9, incident banner appeared. Restarted →
back to 9/9, banner gone.

SERVICES fixture stays in place for Overview's incident banner — replacing
that is a separate follow-up tied to the incident-management backend.
This commit is contained in:
Ronni Baslund
2026-05-24 18:47:38 +02:00
parent 9fac11e668
commit 77a09aaf77
8 changed files with 316 additions and 43 deletions
+103 -28
View File
@@ -1,15 +1,55 @@
<script setup lang="ts">
import { SERVICES, INCIDENT, type PlatformService } from '~/data/fixtures'
import { INCIDENT, PLANNED_SERVICES } from '~/data/fixtures'
const degradedCount = computed(() => SERVICES.filter((s) => s.status !== 'ok').length)
const incidentActive = computed(() => degradedCount.value > 0)
const { open: openIncident } = useIncidentModal()
function tone(s: PlatformService): 'ok' | 'warn' | 'bad' {
return s.status
// Shape of one entry in the array returned by /api/health/platform on platform-api.
// One ProbeResult is rendered as one card in the live grid.
interface ProbeResult {
// Used as the v-for key of the service card.
id: string
// Rendered as the card title.
name: string
// Rendered dimmed underneath the name.
role: string
// Drives the badge tone and label; any value other than 'ok' counts as degraded.
status: 'ok' | 'warn' | 'bad'
// Latency in milliseconds; null is rendered as an em-dash.
latencyMs: number | null
// Shown (truncated) in the card footer when status is not 'ok'.
error?: string
// Parsed with `new Date(...)` to compute the "checked … ago" text.
// NOTE(review): presumably an ISO-8601 timestamp — confirm against platform-api.
checkedAt: string
}
function label(s: PlatformService) {
return s.status === 'ok' ? 'operational' : s.status === 'warn' ? 'degraded' : 'down'
// Load the probe results through the operator's /api/health/platform proxy.
// The `default` factory supplies an empty list as the initial value of `probes`.
const {
  data: probes,
  pending,
  refresh,
} = await useFetch<ProbeResult[]>('/api/health/platform', {
  default() {
    return []
  },
})
// Two intervals run while this page is mounted: one re-fetches the probes
// every 30s, the other ticks `now` once per second so the relative
// "… ago" text keeps counting between fetches.
const now = ref(Date.now())
let pollTimer: ReturnType<typeof setInterval> | null = null
let clockTimer: ReturnType<typeof setInterval> | null = null

function tickClock() {
  now.value = Date.now()
}

onMounted(() => {
  pollTimer = setInterval(() => {
    refresh()
  }, 30_000)
  clockTimer = setInterval(tickClock, 1_000)
})

// Stop both intervals when the page is torn down so they do not keep firing.
onBeforeUnmount(() => {
  for (const timer of [pollTimer, clockTimer]) {
    if (timer) clearInterval(timer)
  }
})
// Headline counters derived from the latest probe results.
// A probe is "live" only when its status is exactly 'ok'; every other status
// counts as degraded, and any degraded probe turns the incident banner on.
const liveCount = computed(() => {
  const results = probes.value ?? []
  return results.reduce((count, probe) => (probe.status === 'ok' ? count + 1 : count), 0)
})
const totalCount = computed(() => {
  const results = probes.value ?? []
  return results.length
})
const degradedCount = computed(() => {
  const results = probes.value ?? []
  return results.reduce((count, probe) => (probe.status !== 'ok' ? count + 1 : count), 0)
})
const incidentActive = computed(() => degradedCount.value >= 1)
// Epoch milliseconds of the first probe's `checkedAt`, or null when there are
// no probe results. Only the first entry is consulted.
const lastCheckedAt = computed(() => {
  const head = probes.value?.[0]
  if (!head) return null
  const parsed = new Date(head.checkedAt)
  return parsed.getTime()
})
// Human-readable age of the last check, e.g. "2s ago"; an em-dash when there
// is no usable timestamp. Re-evaluates whenever `now` ticks.
const checkedAgo = computed(() => {
  const checkedMs = lastCheckedAt.value
  // Falsy covers null (no results) as well as NaN (unparseable timestamp).
  if (!checkedMs) return '—'
  const elapsedSec = Math.floor((now.value - checkedMs) / 1000)
  // Clamp at zero so a timestamp ahead of the local clock never shows a negative age.
  const shownSec = Math.max(0, elapsedSec)
  return `${shownSec}s ago`
})
// Badge tone for a probe: the probe status is passed through unchanged.
function tone(p: ProbeResult): 'ok' | 'warn' | 'bad' {
  const { status } = p
  return status
}
// Badge text for a probe: 'ok' → operational, 'warn' → degraded,
// anything else → down.
function label(p: ProbeResult) {
  switch (p.status) {
    case 'ok':
      return 'operational'
    case 'warn':
      return 'degraded'
    default:
      return 'down'
  }
}
</script>
@@ -18,17 +58,13 @@ function label(s: PlatformService) {
<PageHeader
eyebrow="Operations"
title="Infrastructure"
subtitle="Health of every service that makes up the Dezky platform."
:subtitle="`${liveCount} / ${totalCount} services live · checked ${checkedAgo}`"
>
<template #actions>
<UiButton variant="secondary" disabled>
<UiButton variant="secondary" :disabled="pending" @click="refresh()">
<template #leading><UiIcon name="chevDown" :size="13" /></template>
Refresh
</UiButton>
<UiButton variant="secondary" disabled>
<template #leading><UiIcon name="calendar" :size="13" /></template>
Schedule maintenance
</UiButton>
</template>
</PageHeader>
@@ -40,33 +76,63 @@ function label(s: PlatformService) {
</span>
<div class="body">
<div class="title">{{ INCIDENT.title }}</div>
<div class="sub">Started {{ INCIDENT.started }} · IC: {{ INCIDENT.ic }}</div>
<div class="sub">{{ degradedCount }} service(s) reporting non-ok status · IC: {{ INCIDENT.ic }}</div>
</div>
<UiButton variant="primary" @click="openIncident">Open incident</UiButton>
<UiButton variant="primary" disabled>Open incident</UiButton>
</div>
<Eyebrow class="section-head">Live · {{ totalCount }} services</Eyebrow>
<div class="grid">
<Card v-for="s in SERVICES" :key="s.id" :pad="0">
<Card v-for="p in probes" :key="p.id" :pad="0">
  <div class="head">
    <div>
      <div class="name">{{ p.name }}</div>
      <Mono dim>{{ p.role }}</Mono>
    </div>
    <Badge :tone="tone(p)" dot>{{ label(p) }}</Badge>
  </div>
  <div class="metrics">
    <MetricCell label="uptime · 30d" value="—" :title="'no probe history yet'" />
    <!--
      The value is the latency of the most recent probe only. No probe history
      is kept yet, so it is a single sample, not a percentile; the label says so.
    -->
    <MetricCell
      label="latency · last probe"
      :value="p.latencyMs !== null ? `${p.latencyMs}ms` : '—'"
      :tone="p.latencyMs !== null && p.latencyMs > 300 ? 'warn' : undefined"
    />
    <MetricCell label="error rate" value="—" :title="'no probe history yet'" />
  </div>
  <div class="foot">
    <Mono dim>probed {{ checkedAgo }}</Mono>
    <!-- Full error text is available via the title tooltip; the visible text is cut to 32 chars. -->
    <Mono v-if="p.status !== 'ok' && p.error" :class="['err', p.status]" :title="p.error">
      {{ p.status === 'bad' ? 'down' : 'slow' }} · {{ p.error.slice(0, 32) }}
    </Mono>
    <Mono v-else dim>{{ p.status === 'ok' ? 'ok' : 'check details' }}</Mono>
  </div>
</Card>
</div>
<Eyebrow class="section-head">Planned · {{ PLANNED_SERVICES.length }} services · not deployed</Eyebrow>
<div class="grid planned">
<Card v-for="s in PLANNED_SERVICES" :key="s.id" :pad="0">
<div class="head">
<div>
<div class="name">{{ s.name }}</div>
<Mono dim>{{ s.role }}</Mono>
</div>
<Badge :tone="tone(s)" dot>{{ label(s) }}</Badge>
<Badge tone="neutral" dot>not deployed</Badge>
</div>
<div class="metrics">
<MetricCell label="uptime · 30d" :value="`${s.uptime.toFixed(2)}%`" />
<MetricCell label="p95 latency" :value="`${s.p95}ms`" :tone="s.p95 > 300 ? 'warn' : undefined" />
<MetricCell label="error rate" :value="`${s.err.toFixed(3)}%`" :tone="s.err > 0.04 ? 'warn' : undefined" />
</div>
<div class="foot">
<Mono dim>last incident · {{ s.last }}</Mono>
<Mono dim>details </Mono>
<div class="planned-body">
<Mono dim>{{ s.note }}</Mono>
</div>
</Card>
</div>
<Mono dim class="note">// mock fixtures — wire up to Docker healthchecks + Prometheus in a follow-up</Mono>
<Mono dim class="note">
// probes live in services/platform-api/src/health/. uptime / error rate stay
em-dashed until a probe history (Prometheus, persisted event log) lands
see "Real observability" in NEXT-STEPS.md follow-ups
</Mono>
</div>
</div>
</template>
@@ -74,6 +140,8 @@ function label(s: PlatformService) {
<style scoped>
.stage { padding: 24px 40px 64px 40px; display: flex; flex-direction: column; gap: 16px; }
.section-head { display: block; padding: 6px 4px; }
.incident {
display: flex;
align-items: center;
@@ -103,6 +171,8 @@ function label(s: PlatformService) {
.sub { font-size: 12px; color: var(--text-mute); margin-top: 2px; }
.grid { display: grid; grid-template-columns: repeat(3, 1fr); gap: 12px; }
.grid.planned { opacity: 0.6; }
.head {
padding: 16px 18px 12px 18px;
border-bottom: 1px solid var(--border);
@@ -125,7 +195,12 @@ function label(s: PlatformService) {
border-top: 1px solid var(--border);
display: flex;
justify-content: space-between;
gap: 8px;
}
.err.bad { color: var(--bad); }
.err.warn { color: var(--warn); }
.planned-body { padding: 14px 18px; }
.note { display: block; padding: 4px 4px 0 4px; }
</style>