Failure Handling
12.1 Failure Scenarios and Mitigation
| Scenario | Detection | Mitigation | User Impact |
|---|---|---|---|
| Spot interruption | Interruption notice from provider | Notify user, graceful shutdown | Workload terminates with notice |
| Instance failure | Health check fails 3x | Mark unhealthy, notify user | Routing returns 503 |
| Cloud API failure | API call timeout/error | Retry with backoff, alert | Provisioning delayed |
| Kivera policy block | Block response from Kivera | Return clear error message | Request rejected with reason |
| Price feed failure | Stale data detection | Use cached prices, alert | May use slightly stale pricing |
| D1 unavailable | Request timeout | Retry, fall back to KV cache | Brief degradation |
| KV propagation delay | N/A (expected behavior) | Design for eventual consistency | Routing may be stale ~60s |
12.2 Graceful Degradation
/**
 * Graceful degradation for routing lookups.
 *
 * Resolves the endpoint for a workload by trying three sources in order:
 * the KV routing cache, the D1 `workload_routing` table, and finally the
 * state held by the workload's health-monitor Durable Object. A failure in
 * any one source is logged and the next source is tried.
 *
 * @param env - Worker bindings; uses `ROUTING_KV`, `DB` and `HEALTH_MONITOR`.
 * @param workloadId - Identifier of the workload to look up.
 * @returns The endpoint, or `null` when no source yields one.
 */
async function getWorkloadEndpoint(
  env: Env,
  workloadId: string
): Promise<string | null> {
  const cacheKey = `workload:${workloadId}:endpoint`;

  // Try KV first (fastest). A KV error must not abort the lookup: the whole
  // point of this function is to fall through to the slower sources.
  try {
    const cached = await env.ROUTING_KV.get(cacheKey);
    if (cached) {
      return cached;
    }
  } catch (e) {
    console.error('KV lookup failed:', e);
  }

  // Fallback to D1
  let fromDb: string | null = null;
  try {
    const row = await env.DB.prepare(
      'SELECT endpoint FROM workload_routing WHERE workload_id = ?'
    ).bind(workloadId).first();
    // Narrow at runtime instead of trusting the column type.
    const candidate: unknown = row?.endpoint;
    if (typeof candidate === 'string' && candidate) {
      fromDb = candidate;
    }
  } catch (e) {
    console.error('D1 lookup failed:', e);
  }

  if (fromDb !== null) {
    // Re-populate KV cache. This is best-effort: a failed cache write is
    // logged separately and must not discard the endpoint D1 just returned.
    try {
      await env.ROUTING_KV.put(cacheKey, fromDb, { expirationTtl: 3600 });
    } catch (e) {
      console.error('KV cache re-population failed:', e);
    }
    return fromDb;
  }

  // Fallback to Durable Object state
  try {
    const healthMonitor = env.HEALTH_MONITOR.get(
      env.HEALTH_MONITOR.idFromName(workloadId)
    );
    const state = await healthMonitor.getState();
    return state?.endpoint || null;
  } catch (e) {
    console.error('DO lookup failed:', e);
  }

  return null;
}
12.3 Circuit Breaker Pattern
/**
 * Circuit breaker for cloud provider APIs.
 *
 * States:
 * - `CLOSED`: calls pass through; failures are counted and a success resets
 *   the count.
 * - `OPEN`: calls are rejected immediately with `Circuit breaker is OPEN`
 *   until `timeout` milliseconds have passed since the last failure.
 * - `HALF_OPEN`: exactly one trial call is let through. Its success closes
 *   the breaker; its failure re-opens it. Other calls arriving while the
 *   trial is in flight are rejected with the same error as in `OPEN`.
 */
class CircuitBreaker {
  private failures = 0;
  private lastFailure = 0;
  private state: 'CLOSED' | 'OPEN' | 'HALF_OPEN' = 'CLOSED';
  // True while the single HALF_OPEN trial call is awaiting its result.
  private probeInFlight = false;
  private readonly threshold: number;
  private readonly timeout: number;

  /**
   * @param options - Optional tuning. `threshold` is the number of failures
   *   that opens the breaker (default 5). `timeout` is how long, in
   *   milliseconds, the breaker stays open before allowing a trial call
   *   (default 60000, i.e. 1 minute).
   */
  constructor(options: { threshold?: number; timeout?: number } = {}) {
    this.threshold = options.threshold ?? 5;
    this.timeout = options.timeout ?? 60000;
  }

  /**
   * Runs `fn` under the breaker.
   *
   * @param fn - The protected async operation.
   * @returns Whatever `fn` resolves to.
   * @throws Error `Circuit breaker is OPEN` when the call is rejected without
   *   invoking `fn`; otherwise rethrows whatever `fn` rejected with.
   */
  async execute<T>(fn: () => Promise<T>): Promise<T> {
    if (this.state === 'OPEN') {
      if (Date.now() - this.lastFailure > this.timeout) {
        this.state = 'HALF_OPEN';
      } else {
        throw new Error('Circuit breaker is OPEN');
      }
    }

    const isProbe = this.state === 'HALF_OPEN';
    if (isProbe) {
      // Only one trial call may test the provider at a time; without this
      // guard every concurrent caller would hit a possibly-still-broken API.
      if (this.probeInFlight) {
        throw new Error('Circuit breaker is OPEN');
      }
      this.probeInFlight = true;
    }

    try {
      const result = await fn();
      this.onSuccess();
      return result;
    } catch (e) {
      this.onFailure();
      throw e;
    } finally {
      if (isProbe) {
        this.probeInFlight = false;
      }
    }
  }

  /** Resets the failure count and closes the breaker. */
  private onSuccess(): void {
    this.failures = 0;
    this.state = 'CLOSED';
  }

  /**
   * Records a failure and opens the breaker when the threshold is reached,
   * or immediately when the failure happened during a HALF_OPEN trial.
   */
  private onFailure(): void {
    this.failures++;
    this.lastFailure = Date.now();
    if (this.state === 'HALF_OPEN' || this.failures >= this.threshold) {
      this.state = 'OPEN';
    }
  }
}
// Registry of breakers, one per provider-region pair, keyed "provider:region".
const circuitBreakers = new Map<string, CircuitBreaker>();

/**
 * Returns the circuit breaker for the given provider and region, creating
 * and registering a new one the first time that pair is requested.
 *
 * @param provider - Cloud provider name (first half of the registry key).
 * @param region - Region name (second half of the registry key).
 * @returns The breaker registered for this provider-region pair.
 */
function getCircuitBreaker(provider: string, region: string): CircuitBreaker {
  const registryKey = `${provider}:${region}`;
  let breaker = circuitBreakers.get(registryKey);
  if (breaker === undefined) {
    breaker = new CircuitBreaker();
    circuitBreakers.set(registryKey, breaker);
  }
  return breaker;
}