Failure Handling

12.1 Failure Scenarios and Mitigation

| Scenario | Detection | Mitigation | User Impact |
|---|---|---|---|
| Spot interruption | Interruption notice from provider | Notify user, graceful shutdown | Workload terminates with notice |
| Instance failure | Health check fails 3x | Mark unhealthy, notify user | Routing returns 503 |
| Cloud API failure | API call timeout/error | Retry with backoff, alert | Provisioning delayed |
| Kivera policy block | Block response from Kivera | Return clear error message | Request rejected with reason |
| Price feed failure | Stale data detection | Use cached prices, alert | May use slightly stale pricing |
| D1 unavailable | Request timeout | Retry, fallback to KV cache | Brief degradation |
| KV propagation delay | N/A (expected behavior) | Design for eventual consistency | Routing may be stale ~60s |

12.2 Graceful Degradation

// Graceful degradation for routing lookups
/**
 * Resolves the endpoint for a workload by consulting three sources in order:
 * the routing KV cache, the D1 `workload_routing` table, and finally the
 * workload's health-monitor Durable Object.
 *
 * Each source is isolated in its own try/catch: a failure is logged and the
 * next source is tried, so this function never rejects because a single
 * backing store is unavailable.
 *
 * @param env - Worker bindings; uses `ROUTING_KV`, `DB` and `HEALTH_MONITOR`.
 * @param workloadId - Identifier of the workload to resolve.
 * @returns The endpoint string, or `null` when no source yields one.
 */
async function getWorkloadEndpoint(
  env: Env,
  workloadId: string
): Promise<string | null> {
  const cacheKey = `workload:${workloadId}:endpoint`;

  // Try KV first (fastest). A KV error must not abort the lookup; the
  // slower sources below can still answer.
  try {
    const cached = await env.ROUTING_KV.get(cacheKey);
    if (cached) {
      return cached;
    }
  } catch (e) {
    console.error('KV lookup failed:', e);
  }

  // Fallback to D1
  let dbEndpoint: string | null = null;
  try {
    const row = await env.DB.prepare(
      'SELECT endpoint FROM workload_routing WHERE workload_id = ?'
    ).bind(workloadId).first();

    // Only accept a non-empty string; anything else is treated as a miss.
    const value: unknown = row?.endpoint;
    if (typeof value === 'string' && value) {
      dbEndpoint = value;
    }
  } catch (e) {
    console.error('D1 lookup failed:', e);
  }

  if (dbEndpoint) {
    // Re-populate KV cache. This is best-effort: a failed cache write is
    // logged but must not discard the endpoint D1 already returned.
    try {
      await env.ROUTING_KV.put(cacheKey, dbEndpoint, { expirationTtl: 3600 });
    } catch (e) {
      console.error('KV re-populate failed:', e);
    }
    return dbEndpoint;
  }

  // Fallback to Durable Object state
  try {
    const healthMonitor = env.HEALTH_MONITOR.get(
      env.HEALTH_MONITOR.idFromName(workloadId)
    );
    const state = await healthMonitor.getState();
    return state?.endpoint || null;
  } catch (e) {
    console.error('DO lookup failed:', e);
  }

  return null;
}

12.3 Circuit Breaker Pattern

// Circuit breaker for cloud provider APIs
/**
 * Tracks consecutive failures of a wrapped async operation and short-circuits
 * calls once too many have failed.
 *
 * States:
 * - CLOSED: calls run normally; each failure increments a counter and any
 *   success resets it.
 * - OPEN: entered when the counter reaches `threshold`; calls are rejected
 *   with `Error('Circuit breaker is OPEN')` without invoking the operation
 *   until more than `timeout` ms have passed since the last failure.
 * - HALF_OPEN: entered on the first call after that wait; the call is let
 *   through. Success closes the circuit, failure re-opens it.
 *
 * Note: HALF_OPEN does not limit how many concurrent calls are let through.
 */
class CircuitBreaker {
  private failures = 0;
  private lastFailure = 0;
  private state: 'CLOSED' | 'OPEN' | 'HALF_OPEN' = 'CLOSED';

  private readonly threshold: number;
  private readonly timeout: number;
  private readonly now: () => number;

  /**
   * @param options.threshold - Consecutive failures that open the circuit
   *   (positive integer, default 5).
   * @param options.timeout - Milliseconds to stay OPEN after the last failure
   *   before allowing a trial call (non-negative, default 60000 = 1 minute).
   * @param options.now - Clock returning milliseconds; defaults to `Date.now`.
   *   Injectable so the timing can be controlled deterministically.
   * @throws RangeError if `threshold` or `timeout` is out of range.
   */
  constructor(
    options: { threshold?: number; timeout?: number; now?: () => number } = {}
  ) {
    const { threshold = 5, timeout = 60000, now = () => Date.now() } = options;

    if (!Number.isInteger(threshold) || threshold < 1) {
      throw new RangeError('threshold must be a positive integer');
    }
    if (!Number.isFinite(timeout) || timeout < 0) {
      throw new RangeError('timeout must be a non-negative number');
    }

    this.threshold = threshold;
    this.timeout = timeout;
    this.now = now;
  }

  /**
   * Runs `fn` through the breaker.
   *
   * @param fn - The operation to protect.
   * @returns Whatever `fn` resolves to.
   * @throws Error with message 'Circuit breaker is OPEN' when the circuit is
   *   open and the timeout has not elapsed; otherwise rethrows `fn`'s error.
   */
  async execute<T>(fn: () => Promise<T>): Promise<T> {
    if (this.state === 'OPEN') {
      if (this.now() - this.lastFailure > this.timeout) {
        // Waited long enough: let this call through as a trial.
        this.state = 'HALF_OPEN';
      } else {
        throw new Error('Circuit breaker is OPEN');
      }
    }

    try {
      const result = await fn();
      this.onSuccess();
      return result;
    } catch (e) {
      this.onFailure();
      throw e;
    }
  }

  /** Resets the failure counter and closes the circuit. */
  private onSuccess(): void {
    this.failures = 0;
    this.state = 'CLOSED';
  }

  /** Records a failure and opens the circuit when warranted. */
  private onFailure(): void {
    this.failures++;
    this.lastFailure = this.now();

    // A failed trial call re-opens immediately. The counter is only reset on
    // success, so it is already at or above the threshold in HALF_OPEN; the
    // explicit state check just makes that intent visible.
    if (this.state === 'HALF_OPEN' || this.failures >= this.threshold) {
      this.state = 'OPEN';
    }
  }
}

// One breaker per provider/region pair, keyed as "<provider>:<region>"
const circuitBreakers = new Map<string, CircuitBreaker>();

/**
 * Looks up the circuit breaker registered for a provider/region pair,
 * creating and registering a new one on first use.
 *
 * @param provider - Cloud provider name.
 * @param region - Region name within that provider.
 * @returns The breaker instance shared by all callers using the same pair.
 */
function getCircuitBreaker(provider: string, region: string): CircuitBreaker {
  const registryKey = provider + ':' + region;

  const existing = circuitBreakers.get(registryKey);
  if (existing !== undefined) {
    return existing;
  }

  const created = new CircuitBreaker();
  circuitBreakers.set(registryKey, created);
  return created;
}