13. Monitoring and Observability
13.1 Metrics Collection
// Metrics collected via Workers Analytics Engine
interface WorkloadMetrics {
workloadId: string;
customerId: string;
provider: string;
region: string;
// Timing metrics
provisioningDurationMs: number;
healthCheckLatencyMs: number;
routingLatencyMs: number;
// Cost metrics
spotPricePerHour: number;
totalCostUSD: number;
// Status
status: string;
terminationReason?: string;
}
// Log metrics to Analytics Engine
async function logMetrics(env: Env, metrics: WorkloadMetrics): Promise<void> {
env.ANALYTICS.writeDataPoint({
blobs: [
metrics.workloadId,
metrics.customerId,
metrics.provider,
metrics.region,
metrics.status,
metrics.terminationReason || '',
],
doubles: [
metrics.provisioningDurationMs,
metrics.healthCheckLatencyMs,
metrics.routingLatencyMs,
metrics.spotPricePerHour,
metrics.totalCostUSD,
],
indexes: [metrics.customerId],
});
}
13.2 Alerting Rules
# alerts.yaml - Example alerting configuration
#
# Each entry under `alerts` has:
#   name      - identifier of the alert
#   condition - expression that triggers the alert (literal block scalar)
#   severity  - critical | warning | info (the values used below)
#   message   - human-readable text attached to the alert
#
# Indentation is significant: the keys of an alert must be nested under its
# `- name:` item, and the expression must be indented under `condition: |`.
alerts:
  - name: HighProvisioningFailureRate
    condition: |
      rate(provisioning_failures[5m]) / rate(provisioning_attempts[5m]) > 0.1
    severity: critical
    message: "Provisioning failure rate exceeds 10%"

  # NOTE(review): if this is evaluated with PromQL semantics, rate() is
  # per-second, so `> 10` means more than 10 interruptions per second
  # averaged over 15 minutes - confirm the intended threshold/units.
  - name: SpotInterruptionSpike
    condition: |
      rate(spot_interruptions[15m]) > 10
    severity: warning
    message: "High spot interruption rate detected"

  # 300 seconds = 5 minutes, matching the message text.
  - name: PriceFeedStale
    condition: |
      time() - last_price_update > 300
    severity: warning
    message: "Price feed data is stale (>5 minutes)"

  # NOTE(review): in PromQL, count() does not accept a range selector such as
  # [5m]; count_over_time() or increase() may be what is intended - confirm
  # against the engine that evaluates these conditions.
  - name: HealthCheckFailures
    condition: |
      count(health_check_failures[5m]) > 100
    severity: warning
    message: "High number of health check failures"

  - name: KiveraBlockRate
    condition: |
      rate(kivera_blocks[5m]) / rate(cloud_api_requests[5m]) > 0.05
    severity: info
    message: "Kivera policy blocks exceeding 5% of requests"
13.3 Audit Logging
/** The party that performed an audited action. */
interface AuditActor {
  type: 'user' | 'system' | 'scheduled';
  id: string;
  /** Optional; stored in the `ip_address` column by `logAuditEvent`. */
  ip?: string;
  /** Optional; stored in the `user_agent` column by `logAuditEvent`. */
  userAgent?: string;
}

/** The object an audited action was applied to. */
interface AuditResource {
  type: string;
  id: string;
}

/**
 * Comprehensive audit logging: one entry of the audit trail.
 */
interface AuditEvent {
  /**
   * `logAuditEvent` feeds this to `new Date(...)`, so it is interpreted as
   * milliseconds since the Unix epoch.
   */
  timestamp: number;
  customerId: string;
  workloadId?: string;
  action: string;
  actor: AuditActor;
  resource: AuditResource;
  /** Free-form payload; serialised with `JSON.stringify` when persisted. */
  details: Record<string, unknown>;
  outcome: 'success' | 'failure' | 'blocked';
  errorMessage?: string;
}
/**
 * Persists an audit event in two places:
 *  1. the `audit_log` table in D1 (`env.AUDIT_DB`) for queryable storage, and
 *  2. a JSON object in R2 (`env.LOGS`) for long-term archival.
 *
 * The R2 object key has the form
 * `audit/<customerId>/<YYYY-MM-DD>/<timestamp>-<uuid>.json`, where the date is
 * the UTC day of `event.timestamp` and the UUID keeps keys unique when two
 * events share a timestamp.
 *
 * The two writes run one after the other and are not transactional: if the
 * D1 insert rejects, the R2 write is not attempted; if the R2 write rejects,
 * the D1 row has already been stored.
 *
 * @param env   Worker bindings; uses `AUDIT_DB` and `LOGS`.
 * @param event The event to record. `timestamp` must be a valid epoch value
 *              in milliseconds.
 * @throws RangeError if `event.timestamp` is not a valid date; thrown before
 *         anything is written.
 */
async function logAuditEvent(env: Env, event: AuditEvent): Promise<void> {
  // Build the archive key up front. toISOString() throws on an invalid
  // timestamp, and failing here avoids leaving a D1 row without a matching
  // archive object.
  const day = new Date(event.timestamp).toISOString().slice(0, 10);
  const key = `audit/${event.customerId}/${day}/${event.timestamp}-${crypto.randomUUID()}.json`;

  // Write to D1 for queryable storage.
  // D1 rejects `undefined` bind values, so every optional field is mapped to
  // SQL NULL explicitly.
  await env.AUDIT_DB.prepare(`
    INSERT INTO audit_log
    (customer_id, workload_id, action, actor_type, actor_id, resource_type, resource_id,
    details, outcome, error_message, ip_address, user_agent, created_at)
    VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
  `).bind(
    event.customerId,
    event.workloadId ?? null,
    event.action,
    event.actor.type,
    event.actor.id,
    event.resource.type,
    event.resource.id,
    JSON.stringify(event.details),
    event.outcome,
    event.errorMessage ?? null,
    event.actor.ip ?? null,
    event.actor.userAgent ?? null,
    event.timestamp
  ).run();

  // Also write to R2 for long-term archival (WORM compliance).
  // NOTE(review): immutability has to come from the bucket's configuration;
  // nothing in this function enforces it.
  await env.LOGS.put(key, JSON.stringify(event));
}