feat(scheduling): retry calendar writes for pending bookings

A failed Stalwart calendar write during confirmation no longer deletes the
booking + SlotLock. The booking stays 'pending' with its lock retained, and a
new @Cron worker (every 2 min, max 5 attempts by default) re-drives the write:
on success it promotes to 'confirmed' and sends the confirmation email; after
the cap it moves to the terminal 'calendar_failed' state and releases the lock.

Tracks calendarWriteAttempts + lastCalendarError on the Booking. The public
confirm endpoint still throws 503 on a failed first write (preserving the DoD:
never surface a confirmed booking without a calendar event); the pending row is
left for the background retry to finish.
This commit is contained in:
Ronni Baslund
2026-06-07 08:49:53 +02:00
parent 9e1defa946
commit 2cb13a1a14
4 changed files with 163 additions and 24 deletions
@@ -10,10 +10,11 @@ import { ConfigService } from '@nestjs/config'
import { InjectModel } from '@nestjs/mongoose' import { InjectModel } from '@nestjs/mongoose'
import { randomBytes, randomUUID } from 'node:crypto' import { randomBytes, randomUUID } from 'node:crypto'
import { Model, Types } from 'mongoose' import { Model, Types } from 'mongoose'
import { Booking, BookingDocument } from '../../schemas/booking.schema.js' import { Booking, BookingDocument, BookingStatus } from '../../schemas/booking.schema.js'
import { EventTypeDocument } from '../../schemas/event-type.schema.js' import { EventType, EventTypeDocument } from '../../schemas/event-type.schema.js'
import { HostDocument } from '../../schemas/scheduling-host.schema.js' import { Host, HostDocument } from '../../schemas/scheduling-host.schema.js'
import { SlotLock, SlotLockDocument } from '../../schemas/slot-lock.schema.js' import { SlotLock, SlotLockDocument } from '../../schemas/slot-lock.schema.js'
import { Tenant, TenantDocument } from '../../schemas/tenant.schema.js'
import { confirmationEmail, cancellationEmail } from '../email/booking-templates.js' import { confirmationEmail, cancellationEmail } from '../email/booking-templates.js'
import { buildBookingIcs } from '../email/ics.js' import { buildBookingIcs } from '../email/ics.js'
import { JmapMailer } from '../email/jmap-mailer.service.js' import { JmapMailer } from '../email/jmap-mailer.service.js'
@@ -56,6 +57,9 @@ export class BookingsService {
constructor( constructor(
@InjectModel(Booking.name) private readonly bookingModel: Model<BookingDocument>, @InjectModel(Booking.name) private readonly bookingModel: Model<BookingDocument>,
@InjectModel(SlotLock.name) private readonly lockModel: Model<SlotLockDocument>, @InjectModel(SlotLock.name) private readonly lockModel: Model<SlotLockDocument>,
@InjectModel(Tenant.name) private readonly tenantModel: Model<TenantDocument>,
@InjectModel(Host.name) private readonly hostModel: Model<HostDocument>,
@InjectModel(EventType.name) private readonly eventTypeModel: Model<EventTypeDocument>,
private readonly slots: SlotService, private readonly slots: SlotService,
private readonly provisioner: CredentialProvisioner, private readonly provisioner: CredentialProvisioner,
private readonly gateway: JmapCalendarGateway, private readonly gateway: JmapCalendarGateway,
@@ -166,38 +170,117 @@ export class BookingsService {
} }
// (d) Write to the host's Stalwart calendar; promote to confirmed on success. // (d) Write to the host's Stalwart calendar; promote to confirmed on success.
let access: HostCalendarAccess // On failure we DO NOT delete the booking/lock anymore: the booking stays
// 'pending' with its SlotLock held so the retry worker (§8.2.4) can re-drive
// the write. We still surface 503 to the synchronous caller so the attendee
// is told to retry immediately — but the slot is now durably reserved and
// will be promoted-and-emailed (or terminally released) in the background.
//
// Public-confirm behavior (DoD: never silently confirm without a calendar
// event): the public endpoint keeps throwing ServiceUnavailable on a failed
// first write rather than returning a 'pending' booking, so the UI never
// tells the attendee "confirmed" before a calendar event exists. The pending
// booking lives on for the background retry to finish the job.
const written = await this.attemptCalendarWrite(ctx, booking)
if (!written) {
throw new ServiceUnavailableException('Could not complete the booking on the calendar — please try again.')
}
return booking
}
/**
 * Drives the Stalwart calendar write for a pending booking and, on success,
 * promotes it to 'confirmed' and fires the branded confirmation email.
 *
 * On failure it increments `calendarWriteAttempts`, records `lastCalendarError`
 * and leaves the booking in the status it had on entry (normally 'pending')
 * with its SlotLock untouched, so a later retry can re-drive the write. The
 * write reuses `booking.calendarEventUid` on every attempt; per §9 that is what
 * makes a retry after a partial failure idempotent.
 *
 * @param ctx     Tenant/host/event-type context for the booking.
 * @param booking The booking document to write and (on success) promote.
 * @returns true iff the booking was persisted as 'confirmed'. Never throws for
 *          a failed write; failures are reported through the return value.
 */
private async attemptCalendarWrite(ctx: BookingContext, booking: BookingDocument): Promise<boolean> {
  const { host, eventType } = ctx
  // Remembered so a failure after the in-memory promotion can be rolled back.
  const statusOnEntry = booking.status
  try {
    const access = await this.provisioner.resolveAccess(host)
    const { id } = await this.gateway.createEvent(access, {
      uid: booking.calendarEventUid,
      title: eventType.title,
      description: booking.attendeeNotes,
      startUtc: booking.startUtc,
      endUtc: booking.endUtc,
      hostTimezone: host.timezone,
      location: booking.locationUrl,
      hostEmail: host.email,
      attendeeName: booking.attendeeName,
      attendeeEmail: booking.attendeeEmail,
    })
    booking.calendarEventId = id
    booking.status = 'confirmed'
    await booking.save()
    // Branded confirmation email — best-effort (booking already valid).
    this.sendEmail(ctx, booking, access, 'confirmation').catch((e) =>
      this.logger.warn(`Confirmation email failed for ${booking.attendeeEmail}: ${e.message}`),
    )
    return true
  } catch (err) {
    const reason = err instanceof Error ? err.message : String(err)
    // If the event was created but the promoting save() threw, the in-memory
    // status is already 'confirmed'. Roll it back so the bookkeeping save below
    // cannot persist a confirmed booking that this method reports as failed
    // (and for which no confirmation email was sent).
    booking.status = statusOnEntry
    booking.calendarWriteAttempts = (booking.calendarWriteAttempts ?? 0) + 1
    booking.lastCalendarError = reason
    // Bookkeeping is best-effort, but a failure here means the attempt counter
    // was not persisted — log it instead of swallowing it silently.
    await booking.save().catch((saveErr: unknown) =>
      this.logger.warn(
        `Could not persist calendar-write bookkeeping for booking ${booking._id}: ${
          saveErr instanceof Error ? saveErr.message : String(saveErr)
        }`,
      ),
    )
    this.logger.error(
      `Calendar write failed for ${host.email} (booking ${booking._id}, attempt ${booking.calendarWriteAttempts}): ${reason}`,
    )
    return false
  }
}
// ── Retry queue (§8.2.4) ─────────────────────────────────────────────────────
/**
 * Re-drive the calendar write for a single pending booking. On success the
 * booking is promoted to confirmed and the confirmation email is sent. Once
 * attempts reach `maxAttempts` the booking moves to the terminal
 * 'calendar_failed' state and its SlotLock is released so the slot frees up.
 *
 * A booking whose tenant, host or event type can no longer be loaded cannot be
 * written at all; that counts as a failed attempt so the booking eventually
 * reaches the cap and releases its slot instead of staying 'pending' forever.
 *
 * Invoked by the retry worker.
 *
 * @param booking     The pending booking to retry.
 * @param maxAttempts Attempt cap; at or above it the booking is failed terminally.
 * @returns The booking's resulting (in-memory) status.
 */
async retryPendingCalendarWrite(booking: BookingDocument, maxAttempts: number): Promise<BookingStatus> {
  const describe = (e: unknown): string => (e instanceof Error ? e.message : String(e))

  // The three lookups are independent, so load them concurrently.
  const [tenant, host, eventType] = await Promise.all([
    this.tenantModel.findById(booking.tenantId).exec(),
    this.hostModel.findById(booking.hostId).exec(),
    this.eventTypeModel.findById(booking.eventTypeId).exec(),
  ])

  // attemptCalendarWrite persists its own bookkeeping; we only need to save
  // here when this method itself changed the document.
  let dirty = false

  if (tenant && host && eventType) {
    const ctx: BookingContext = {
      tenant: { _id: tenant._id, slug: tenant.slug, name: tenant.name, brandColor: tenant.brandColor },
      host,
      eventType,
    }
    const ok = await this.attemptCalendarWrite(ctx, booking)
    if (ok) return booking.status
  } else {
    booking.calendarWriteAttempts = (booking.calendarWriteAttempts ?? 0) + 1
    booking.lastCalendarError = 'Missing tenant/host/eventType for booking'
    dirty = true
    this.logger.warn(
      `Retry not possible — missing tenant/host/eventType for booking ${booking._id} (attempt ${booking.calendarWriteAttempts})`,
    )
  }

  const exhausted = booking.calendarWriteAttempts >= maxAttempts
  if (exhausted) {
    booking.status = 'calendar_failed'
    dirty = true
  }

  if (dirty) {
    await booking
      .save()
      .catch((e: unknown) =>
        this.logger.warn(`Could not persist retry state for booking ${booking._id}: ${describe(e)}`),
      )
  }

  if (exhausted) {
    // Lock release stays best-effort, but a failure is logged rather than hidden.
    await this.lockModel
      .deleteOne({ hostId: booking.hostId, startUtc: booking.startUtc, bookingId: booking._id })
      .exec()
      .catch((e: unknown) =>
        this.logger.warn(`Could not release slot lock for booking ${booking._id}: ${describe(e)}`),
      )
    this.logger.error(
      `Booking ${booking._id} reached max calendar-write attempts (${maxAttempts}); marked calendar_failed and released its slot lock. Last error: ${booking.lastCalendarError}`,
    )
  }
  return booking.status
}
/**
 * Lists bookings that are still 'pending' and have used fewer than
 * `maxAttempts` calendar-write attempts, oldest first.
 *
 * @param maxAttempts Exclusive upper bound on `calendarWriteAttempts`.
 * @param limit       Maximum number of bookings returned (default 100).
 */
findPendingForRetry(maxAttempts: number, limit = 100): Promise<BookingDocument[]> {
  const underCap = { status: 'pending', calendarWriteAttempts: { $lt: maxAttempts } }
  const oldestFirst = this.bookingModel.find(underCap).sort({ createdAt: 1 })
  return oldestFirst.limit(limit).exec()
}
// ── Manage / cancel / reschedule ─────────────────────────────────────────── // ── Manage / cancel / reschedule ───────────────────────────────────────────
@@ -0,0 +1,41 @@
import { Injectable, Logger } from '@nestjs/common'
import { ConfigService } from '@nestjs/config'
import { Cron } from '@nestjs/schedule'
import { BookingsService } from './bookings.service.js'
/**
 * Calendar-write retry queue (§8.2.4).
 *
 * When the synchronous Stalwart calendar write fails during confirmation, the
 * booking is left 'pending' with its SlotLock retained (rather than deleted).
 * This cron periodically re-drives the write for those pending bookings via
 * `BookingsService.retryPendingCalendarWrite`, passing the attempt cap along.
 * The per-booking attempt counter lives on the Booking, not in this worker.
 *
 * Max attempts is configurable via SCHEDULING_CALENDAR_RETRY_MAX (default 5).
 */
@Injectable()
export class CalendarRetryWorker {
  private readonly logger = new Logger(CalendarRetryWorker.name)
  private readonly maxAttempts: number
  // True while a sweep is in flight. Bookings are retried one at a time, so a
  // sweep can outlast the 2-minute interval; without this flag the next tick
  // would re-drive the same bookings concurrently.
  private running = false

  constructor(
    private readonly bookings: BookingsService,
    config: ConfigService,
  ) {
    const raw = Number(config.get<string>('SCHEDULING_CALENDAR_RETRY_MAX'))
    // Floor before validating: a fractional value such as 0.5 is > 0 but floors
    // to 0, which would silently disable both retries and the terminal state.
    const parsed = Math.floor(raw)
    this.maxAttempts = Number.isFinite(parsed) && parsed >= 1 ? parsed : 5
  }

  /**
   * One retry sweep: loads the pending bookings still under the cap and retries
   * them sequentially. A failure on one booking is logged and does not stop the
   * rest of the sweep.
   */
  // Every 2 minutes (@nestjs/schedule has no EVERY_2_MINUTES preset).
  @Cron('*/2 * * * *', { name: 'calendar-write-retry' })
  async run(): Promise<void> {
    if (this.running) {
      this.logger.warn('Previous calendar-write retry sweep still running; skipping this tick')
      return
    }
    this.running = true
    try {
      const pending = await this.bookings.findPendingForRetry(this.maxAttempts)
      if (pending.length === 0) return
      this.logger.log(`Retrying calendar writes for ${pending.length} pending booking(s)`)
      for (const booking of pending) {
        await this.bookings
          .retryPendingCalendarWrite(booking, this.maxAttempts)
          .catch((e: unknown) =>
            this.logger.warn(
              `Calendar retry failed for booking ${booking._id}: ${e instanceof Error ? e.message : String(e)}`,
            ),
          )
      }
    } finally {
      // Always clear the flag, including when loading the pending list throws.
      this.running = false
    }
  }
}
@@ -14,6 +14,7 @@ import { User, UserSchema } from '../schemas/user.schema.js'
import { TenantsModule } from '../tenants/tenants.module.js' import { TenantsModule } from '../tenants/tenants.module.js'
import { AvailabilityService } from './availability/availability.service.js' import { AvailabilityService } from './availability/availability.service.js'
import { BookingsService } from './bookings/bookings.service.js' import { BookingsService } from './bookings/bookings.service.js'
import { CalendarRetryWorker } from './bookings/calendar-retry.worker.js'
import { JmapMailer } from './email/jmap-mailer.service.js' import { JmapMailer } from './email/jmap-mailer.service.js'
import { EventTypesService } from './event-types/event-types.service.js' import { EventTypesService } from './event-types/event-types.service.js'
import { HostsService } from './hosts/hosts.service.js' import { HostsService } from './hosts/hosts.service.js'
@@ -59,6 +60,7 @@ import { StalwartCalendarModule } from './stalwart-calendar/stalwart-calendar.mo
PublicSchedulingService, PublicSchedulingService,
JmapMailer, JmapMailer,
BookingReminderWorker, BookingReminderWorker,
CalendarRetryWorker,
], ],
}) })
export class SchedulingModule {} export class SchedulingModule {}
@@ -6,7 +6,9 @@ export type BookingDocument = HydratedDocument<Booking>
// 'pending' is the compensating state when the SlotLock is held but the calendar // 'pending' is the compensating state when the SlotLock is held but the calendar
// write hasn't succeeded yet (§8.2.4) — never surfaced as a confirmed booking. // write hasn't succeeded yet (§8.2.4) — never surfaced as a confirmed booking.
export type BookingStatus = 'pending' | 'confirmed' | 'cancelled' | 'rescheduled' // 'calendar_failed' is the terminal state once the retry worker exhausts its
// attempts; the SlotLock is released so the slot frees up.
export type BookingStatus = 'pending' | 'confirmed' | 'cancelled' | 'rescheduled' | 'calendar_failed'
// A confirmed appointment. All instants are UTC; attendee/host tz are IANA // A confirmed appointment. All instants are UTC; attendee/host tz are IANA
// strings for display. `calendarEventUid` is generated client-side BEFORE the // strings for display. `calendarEventUid` is generated client-side BEFORE the
@@ -23,7 +25,7 @@ export class Booking {
@Prop({ type: Types.ObjectId, ref: 'Host', required: true, index: true }) @Prop({ type: Types.ObjectId, ref: 'Host', required: true, index: true })
hostId!: Types.ObjectId hostId!: Types.ObjectId
@Prop({ enum: ['pending', 'confirmed', 'cancelled', 'rescheduled'], default: 'pending', index: true }) @Prop({ enum: ['pending', 'confirmed', 'cancelled', 'rescheduled', 'calendar_failed'], default: 'pending', index: true })
status!: BookingStatus status!: BookingStatus
@Prop({ required: true, index: true }) @Prop({ required: true, index: true })
@@ -84,6 +86,17 @@ export class Booking {
// then atomically appends it — making reminders idempotent across runs. // then atomically appends it — making reminders idempotent across runs.
@Prop({ type: [Number], default: [] }) @Prop({ type: [Number], default: [] })
sentReminderOffsets!: number[] sentReminderOffsets!: number[]
// Calendar-write retry bookkeeping (§8.2.4). When the synchronous Stalwart
// write fails, the booking stays 'pending' with its SlotLock retained and the
// retry worker drives `calendarWriteAttempts` up to the configured max. The
// last error is kept for diagnostics; on terminal failure status becomes
// 'calendar_failed' and the lock is released.
@Prop({ default: 0 })
calendarWriteAttempts!: number
@Prop({ trim: true })
lastCalendarError?: string
} }
export const BookingSchema = SchemaFactory.createForClass(Booking) export const BookingSchema = SchemaFactory.createForClass(Booking)