13. Monitoring and Observability

13.1 Metrics Collection

// Metrics collected via Workers Analytics Engine
/**
 * One metrics sample describing a single workload.
 *
 * `logMetrics` writes the string fields of this record as Analytics Engine
 * blobs, the numeric fields as doubles, and uses `customerId` as the index.
 */
interface WorkloadMetrics {
  // Identity / dimensions (written as blobs by logMetrics)
  workloadId: string;
  // Also used as the Analytics Engine index by logMetrics
  customerId: string;
  provider: string;
  region: string;
  
  // Timing metrics
  // Units are presumably milliseconds, going by the `Ms` suffix — confirm with producers
  provisioningDurationMs: number;
  healthCheckLatencyMs: number;
  routingLatencyMs: number;
  
  // Cost metrics
  spotPricePerHour: number;
  totalCostUSD: number;
  
  // Status
  // Free-form string; the set of allowed values is not defined in this section
  status: string;
  // Optional; logMetrics records an empty string when this is absent
  terminationReason?: string;
}

/**
 * Records one workload metrics sample in Workers Analytics Engine.
 *
 * String dimensions are written as `blobs`, numeric measurements as `doubles`,
 * and the customer ID as the single index entry. The order of entries in each
 * array is deliberately fixed, since consumers may address them by position.
 *
 * @param env - Worker bindings; `env.ANALYTICS` receives the data point.
 * @param metrics - The sample to record.
 */
async function logMetrics(env: Env, metrics: WorkloadMetrics): Promise<void> {
  const {
    workloadId,
    customerId,
    provider,
    region,
    status,
    terminationReason,
    provisioningDurationMs,
    healthCheckLatencyMs,
    routingLatencyMs,
    spotPricePerHour,
    totalCostUSD,
  } = metrics;

  // A missing (or empty) termination reason is recorded as an empty string
  // so the blob positions stay stable.
  const textDimensions = [
    workloadId,
    customerId,
    provider,
    region,
    status,
    terminationReason || '',
  ];

  const numericValues = [
    provisioningDurationMs,
    healthCheckLatencyMs,
    routingLatencyMs,
    spotPricePerHour,
    totalCostUSD,
  ];

  env.ANALYTICS.writeDataPoint({
    blobs: textDimensions,
    doubles: numericValues,
    indexes: [customerId],
  });
}

13.2 Alerting Rules

# alerts.yaml - Example alerting configuration

# Alert rule list. Each rule carries a name, a condition expression, a severity
# (critical / warning / info are the values used below), and a message.
# NOTE(review): the condition syntax resembles PromQL, but the engine that
# evaluates these rules is not shown here — confirm exact semantics against it.
alerts:
  # Failures divided by attempts, both over 5m windows; threshold 0.1 (10% per the message).
  - name: HighProvisioningFailureRate
    condition: |
      rate(provisioning_failures[5m]) / rate(provisioning_attempts[5m]) > 0.1
    severity: critical
    message: "Provisioning failure rate exceeds 10%"
    
  # Spot interruptions over a 15m window; threshold 10.
  # NOTE(review): if this is PromQL, rate() is per-second, so "> 10" would mean
  # more than 10 interruptions per second — confirm the intended unit.
  - name: SpotInterruptionSpike
    condition: |
      rate(spot_interruptions[15m]) > 10
    severity: warning
    message: "High spot interruption rate detected"
    
  # Age of the last price update; threshold 300 (5 minutes per the message,
  # which implies last_price_update and time() are in seconds — confirm).
  - name: PriceFeedStale
    condition: |
      time() - last_price_update > 300
    severity: warning
    message: "Price feed data is stale (>5 minutes)"
    
  # Health check failures over a 5m window; threshold 100.
  # NOTE(review): if this is PromQL, count() does not accept a range selector;
  # count_over_time() or increase() may be what is intended — confirm.
  - name: HealthCheckFailures
    condition: |
      count(health_check_failures[5m]) > 100
    severity: warning
    message: "High number of health check failures"
    
  # Kivera blocks divided by cloud API requests, both over 5m windows;
  # threshold 0.05 (5% per the message).
  - name: KiveraBlockRate
    condition: |
      rate(kivera_blocks[5m]) / rate(cloud_api_requests[5m]) > 0.05
    severity: info
    message: "Kivera policy blocks exceeding 5% of requests"

13.3 Audit Logging

/**
 * A single audit record: who did what to which resource, and how it ended.
 *
 * `logAuditEvent` stores these fields as one row of the `audit_log` table and
 * archives the whole event as a JSON object.
 */
interface AuditEvent {
  // Passed to `new Date(...)` by logAuditEvent, i.e. treated there as epoch milliseconds
  timestamp: number;
  customerId: string;
  // Optional: not every event has an associated workload
  workloadId?: string;
  // Free-form action name; allowed values are not defined in this section
  action: string;
  // Who triggered the action
  actor: {
    type: 'user' | 'system' | 'scheduled';
    id: string;
    // Stored in the `ip_address` column by logAuditEvent
    ip?: string;
    // Stored in the `user_agent` column by logAuditEvent
    userAgent?: string;
  };
  // What the action was performed on
  resource: {
    type: string;
    id: string;
  };
  // Arbitrary extra context; serialized with JSON.stringify when stored
  details: Record<string, unknown>;
  outcome: 'success' | 'failure' | 'blocked';
  // Optional error text
  errorMessage?: string;
}

/**
 * Persists an audit event to both audit sinks: a row in the D1 `audit_log`
 * table (queryable storage) and a JSON object in the `LOGS` bucket
 * (long-term archival).
 *
 * The two writes run sequentially; if the D1 insert rejects, the archive
 * write is not attempted.
 *
 * @param env - Worker bindings; uses `env.AUDIT_DB` and `env.LOGS`.
 * @param event - The audit event to record. `event.timestamp` is passed to
 *   `new Date(...)`, so it is treated as epoch milliseconds.
 * @returns A promise that resolves once both writes have completed.
 * @throws RangeError if `event.timestamp` is not a valid date value; this is
 *   raised before anything is written.
 */
async function logAuditEvent(env: Env, event: AuditEvent): Promise<void> {
  // Build the archive key up front: toISOString() throws on an invalid
  // timestamp, and failing here avoids a D1 row with no matching archive object.
  // The random UUID suffix keeps keys unique for events sharing a timestamp.
  const day = new Date(event.timestamp).toISOString().slice(0, 10);
  const key = `audit/${event.customerId}/${day}/${event.timestamp}-${crypto.randomUUID()}.json`;

  // Write to D1 for queryable storage.
  // D1 rejects `undefined` bind values, so absent optional fields are stored
  // as SQL NULL instead.
  await env.AUDIT_DB.prepare(`
    INSERT INTO audit_log 
    (customer_id, workload_id, action, actor_type, actor_id, resource_type, resource_id, 
     details, outcome, error_message, ip_address, user_agent, created_at)
    VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
  `).bind(
    event.customerId,
    event.workloadId ?? null,
    event.action,
    event.actor.type,
    event.actor.id,
    event.resource.type,
    event.resource.id,
    JSON.stringify(event.details),
    event.outcome,
    event.errorMessage ?? null,
    event.actor.ip ?? null,
    event.actor.userAgent ?? null,
    event.timestamp
  ).run();

  // Also write to R2 for long-term archival (WORM compliance)
  await env.LOGS.put(key, JSON.stringify(event));
}