Published on

The Bulkhead Pattern — Isolating Failures So One Bad Dependency Doesn't Sink Everything

Authors

Introduction

A slow dependency (payment service taking 10 seconds) consumes all your threads, blocking requests to fast services (user profile, product search). The bulkhead pattern isolates resources: each downstream service (or tenant, or feature) gets its own pool of connections/threads. If one fills up, others continue serving. We'll build bulkheads in Node.js, measure their effectiveness, and size them properly.

Thread Pool Bulkhead

A separate pool per downstream service prevents one slow service from blocking others. (Node.js doesn't expose user-level thread pools for JavaScript code, so in practice we bound the number of concurrent in-flight requests per service — the isolation effect is the same.)

import pLimit from 'p-limit';

class BulkheadExecutor {
  private limiters = new Map<string, any>();
  private concurrencyLimits: Record<string, number> = {
    'payment-service': 10,
    'user-service': 50,
    'product-service': 100,
    'default': 20,
  };

  async executeWithBulkhead<T>(
    serviceName: string,
    fn: () => Promise<T>
  ): Promise<T> {
    let limiter = this.limiters.get(serviceName);

    if (!limiter) {
      const limit = this.concurrencyLimits[serviceName] || this.concurrencyLimits['default'];
      limiter = pLimit(limit);
      this.limiters.set(serviceName, limiter);
    }

    try {
      return await limiter(() => fn());
    } catch (error) {
      console.error(`Bulkhead failure for ${serviceName}:`, error);
      throw error;
    }
  }

  // Get current utilization
  getUtilization(serviceName: string): number {
    const limiter = this.limiters.get(serviceName);
    if (!limiter) return 0;
    // Implementation would depend on p-limit internals
    return 0;
  }
}

// Usage in Express middleware
const bulkhead = new BulkheadExecutor();

app.get('/user/:id', async (req, res) => {
  try {
    // User service gets up to 50 concurrent requests
    const user = await bulkhead.executeWithBulkhead(
      'user-service',
      () => userService.getUser(req.params.id)
    );
    res.json(user);
  } catch (error) {
    res.status(503).json({ error: 'Service unavailable' });
  }
});

app.get('/payment/status/:transactionId', async (req, res) => {
  try {
    // Payment service gets only 10 concurrent requests
    // If payment service is slow, it won't block user/product requests
    const status = await bulkhead.executeWithBulkhead(
      'payment-service',
      () => paymentService.getStatus(req.params.transactionId)
    );
    res.json(status);
  } catch (error) {
    res.status(503).json({ error: 'Service unavailable' });
  }
});

Semaphore-Based Bulkhead in Node.js

Semaphores control how many operations can run concurrently.

class Semaphore {
  // Permits currently free. Invariant: never negative.
  private permits: number;
  // FIFO queue of resolvers for acquirers parked waiting on a permit.
  private waiters: Array<() => void> = [];

  constructor(initialPermits: number) {
    this.permits = initialPermits;
  }

  /**
   * Take a permit, waiting (FIFO) if none are free.
   *
   * BUG FIX: the original waiter callback also decremented `permits`
   * when woken, but release() wakes a waiter *instead of* incrementing
   * `permits` — the permit is transferred directly. Decrementing again
   * double-counted the hand-off, driving `permits` negative and leaking
   * one permit per contended acquire/release cycle.
   */
  async acquire(): Promise<void> {
    if (this.permits > 0) {
      this.permits--;
      return;
    }

    // No permit free: park until release() hands one over. The count is
    // deliberately NOT touched here (see above).
    return new Promise(resolve => {
      this.waiters.push(resolve);
    });
  }

  /**
   * Return a permit. If anyone is waiting, the permit is handed to the
   * longest-waiting acquirer; otherwise it goes back into the pool.
   */
  release(): void {
    const waiter = this.waiters.shift();
    if (waiter) {
      waiter();
    } else {
      this.permits++;
    }
  }

  /** Run `fn` while holding a permit; always releases, even on throw. */
  async withPermit<T>(fn: () => Promise<T>): Promise<T> {
    await this.acquire();
    try {
      return await fn();
    } finally {
      this.release();
    }
  }

  /** Permits currently free (excludes permits held or in hand-off). */
  getAvailablePermits(): number {
    return this.permits;
  }

  /** Number of acquirers currently parked waiting for a permit. */
  getQueuedWaiters(): number {
    return this.waiters.length;
  }
}

class SemaphoreBulkhead {
  // Fixed set of semaphores, one per service named at construction.
  private semaphores = new Map<string, Semaphore>();

  constructor(private limits: Record<string, number>) {
    for (const [service, limit] of Object.entries(limits)) {
      this.semaphores.set(service, new Semaphore(limit));
    }
  }

  /**
   * Run `fn` under the named service's semaphore. Calls beyond the
   * limit queue until a permit frees up. Throws for services that were
   * not configured at construction time.
   */
  async executeWithBulkhead<T>(
    serviceName: string,
    fn: () => Promise<T>
  ): Promise<T> {
    const semaphore = this.semaphores.get(serviceName);
    if (!semaphore) {
      throw new Error(`No bulkhead configured for ${serviceName}`);
    }
    return semaphore.withPermit(fn);
  }

  /** Snapshot of one service's pool: free permits and queued callers. */
  getMetrics(serviceName: string): { available: number; waiting: number } {
    const semaphore = this.semaphores.get(serviceName);
    return semaphore
      ? {
          available: semaphore.getAvailablePermits(),
          waiting: semaphore.getQueuedWaiters(),
        }
      : { available: 0, waiting: 0 };
  }

  /** Snapshot of every configured service, keyed by service name. */
  getAllMetrics(): Record<string, { available: number; waiting: number }> {
    const result: Record<string, { available: number; waiting: number }> = {};
    for (const service of this.semaphores.keys()) {
      result[service] = this.getMetrics(service);
    }
    return result;
  }
}

// Usage
const bulkhead = new SemaphoreBulkhead({
  'payment-service': 10,
  'user-service': 50,
  'product-service': 100,
});

app.get('/user/:id', async (req, res) => {
  try {
    const user = await bulkhead.executeWithBulkhead(
      'user-service',
      () => userService.getUser(req.params.id)
    );
    res.json(user);
  } catch (error) {
    res.status(503).json({ error: 'Service temporarily unavailable' });
  }
});

// Monitor bulkhead health
app.get('/metrics/bulkhead', (req, res) => {
  res.json(bulkhead.getAllMetrics());
});

Bulkhead + Circuit Breaker Combination

Combine bulkheads with circuit breakers for complete isolation.

class ResilientBulkhead {
  private bulkhead: SemaphoreBulkhead;
  // Lazily-created circuit breaker per service.
  private breakers = new Map<string, CircuitBreaker>();

  constructor(
    private serviceLimits: Record<string, number>,
    private failureThreshold = 5,
    private resetTimeout = 60000
  ) {
    this.bulkhead = new SemaphoreBulkhead(serviceLimits);
  }

  /**
   * Guard `fn` with the service's circuit breaker, then its bulkhead.
   * Breaker goes on the outside so an open circuit fails fast without
   * consuming (or queueing for) a bulkhead permit.
   */
  async execute<T>(
    serviceName: string,
    fn: () => Promise<T>
  ): Promise<T> {
    const breaker = this.getOrCreateBreaker(serviceName);

    return breaker.execute(async () => {
      return this.bulkhead.executeWithBulkhead(serviceName, fn);
    });
  }

  private getOrCreateBreaker(serviceName: string): CircuitBreaker {
    if (!this.breakers.has(serviceName)) {
      this.breakers.set(
        serviceName,
        new CircuitBreaker(
          this.failureThreshold,
          this.resetTimeout,
          serviceName
        )
      );
    }
    return this.breakers.get(serviceName)!;
  }

  /**
   * Per-service health snapshot: bulkhead occupancy + circuit state.
   * Only services that have been called at least once (and therefore
   * have a breaker) appear in the result.
   */
  getHealth(): Record<
    string,
    { bulkhead: { available: number; waiting: number }; circuit: string }
  > {
    const health: Record<
      string,
      { bulkhead: { available: number; waiting: number }; circuit: string }
    > = {};

    // Iterate entries so the breaker is used directly — the original
    // discarded the value, re-fetched it with `?.`, and carried an
    // unreachable 'UNKNOWN' fallback.
    for (const [service, breaker] of this.breakers) {
      health[service] = {
        bulkhead: this.bulkhead.getMetrics(service),
        circuit: breaker.getState(),
      };
    }

    return health;
  }
}

class CircuitBreaker {
  private state: 'CLOSED' | 'OPEN' | 'HALF_OPEN' = 'CLOSED';
  // Consecutive failures since the last success while CLOSED.
  private failureCount = 0;
  // Consecutive successful probes while HALF_OPEN.
  private successCount = 0;
  private lastFailureTime = 0;

  constructor(
    private failureThreshold: number,
    private resetTimeout: number,
    private name: string
  ) {}

  /**
   * Run `fn` under the breaker.
   *
   * While OPEN, throws immediately until `resetTimeout` has elapsed
   * since the last failure; then one HALF_OPEN probe window is allowed.
   * Errors from `fn` are recorded and rethrown.
   */
  async execute<T>(fn: () => Promise<T>): Promise<T> {
    if (this.state === 'OPEN') {
      if (Date.now() - this.lastFailureTime > this.resetTimeout) {
        this.state = 'HALF_OPEN';
        this.successCount = 0;
      } else {
        throw new Error(`Circuit breaker OPEN for ${this.name}`);
      }
    }

    try {
      const result = await fn();
      this.onSuccess();
      return result;
    } catch (error) {
      this.onFailure();
      throw error;
    }
  }

  // Record a success in the current state.
  private onSuccess(): void {
    if (this.state === 'HALF_OPEN') {
      this.successCount++;
      // Require 3 consecutive successful probes before fully closing.
      if (this.successCount >= 3) {
        this.state = 'CLOSED';
        this.failureCount = 0;
      }
    } else {
      // BUG FIX: a success while CLOSED resets the failure streak.
      // Without this, occasional unrelated failures accumulated forever
      // and eventually tripped the breaker even at a 99% success rate —
      // the threshold is meant to count *consecutive* failures.
      this.failureCount = 0;
    }
  }

  // Record a failure and open the circuit if warranted.
  private onFailure(): void {
    this.failureCount++;
    this.lastFailureTime = Date.now();

    // A failed probe while HALF_OPEN reopens immediately (made explicit
    // here; the original relied on the stale counter still being at the
    // threshold). While CLOSED, open once the streak hits the threshold.
    if (this.state === 'HALF_OPEN' || this.failureCount >= this.failureThreshold) {
      this.state = 'OPEN';
    }
  }

  getState(): string {
    return this.state;
  }
}

Resource Isolation by Tenant

Prevent the noisy-neighbor problem: without isolation, a single tenant consuming all shared resources starves every other tenant.

class TenantBulkhead {
  // Lazily-created semaphore per tenant.
  private tenantLimiters = new Map<string, Semaphore>();
  // Last-resort fallback if the 'default' entry were ever removed.
  private defaultLimit = 20;
  private tenantLimits: Map<string, number> = new Map([
    ['enterprise-customer', 100],
    ['vip-customer', 50],
    ['default', 10],
  ]);
  // Handle for the cleanup timer so repeated calls don't stack intervals.
  private cleanupTimer?: ReturnType<typeof setInterval>;

  /**
   * Single source of truth for a tenant's concurrency limit.
   * BUG FIX: the original resolved limits differently in
   * executeForTenant (fallback 'default' → 10) and cleanupIdleTenants
   * (fallback defaultLimit → 20), so unconfigured tenants could never
   * match the idle check and were never cleaned up.
   */
  private limitFor(tenantId: string): number {
    return (
      this.tenantLimits.get(tenantId) ??
      this.tenantLimits.get('default') ??
      this.defaultLimit
    );
  }

  /** Run `fn` under the tenant's semaphore, creating it on first use. */
  async executeForTenant<T>(
    tenantId: string,
    fn: () => Promise<T>
  ): Promise<T> {
    let semaphore = this.tenantLimiters.get(tenantId);

    if (!semaphore) {
      semaphore = new Semaphore(this.limitFor(tenantId));
      this.tenantLimiters.set(tenantId, semaphore);
    }

    return semaphore.withPermit(fn);
  }

  /** Snapshot of a tenant's pool; zeros if the tenant has never run. */
  getTenantMetrics(tenantId: string): {
    available: number;
    waiting: number;
  } {
    const semaphore = this.tenantLimiters.get(tenantId);
    if (!semaphore) return { available: 0, waiting: 0 };

    return {
      available: semaphore.getAvailablePermits(),
      waiting: semaphore.getQueuedWaiters(),
    };
  }

  /**
   * Periodically drop semaphores for tenants that look idle (nobody
   * waiting, every permit free). Idle tenants are recreated on demand.
   * BUG FIX: the original never stored the interval handle, so each
   * call leaked an extra timer; now a repeat call replaces the old one.
   */
  cleanupIdleTenants(inactivityMs = 3600000): void {
    if (this.cleanupTimer) clearInterval(this.cleanupTimer);
    this.cleanupTimer = setInterval(() => {
      for (const [tenantId, semaphore] of this.tenantLimiters) {
        // Idle heuristic compared against the same limit the semaphore
        // was created with (via limitFor).
        if (
          semaphore.getQueuedWaiters() === 0 &&
          semaphore.getAvailablePermits() === this.limitFor(tenantId)
        ) {
          this.tenantLimiters.delete(tenantId);
        }
      }
    }, inactivityMs);
  }
}

// Usage with tenant from request context
app.use((req, res, next) => {
  (req as any).tenantId = req.headers['x-tenant-id'];
  next();
});

app.post('/data', async (req, res) => {
  const tenantId = (req as any).tenantId;

  try {
    const result = await tenantBulkhead.executeForTenant(
      tenantId,
      () => processData(req.body)
    );
    res.json(result);
  } catch (error) {
    res.status(503).json({ error: 'Tenant quota exceeded' });
  }
});

const tenantBulkhead = new TenantBulkhead();

Queue Depth Limits as Bulkhead

When a queue grows too deep, it signals saturation. Reject new requests.

class QueueBasedBulkhead<T> {
  private queue: T[] = [];
  private processing = 0;

  /**
   * @param concurrency  - Max items processed in parallel.
   * @param maxQueueSize - Queue depth above which new items are rejected,
   *                       shedding load early instead of buffering forever.
   * (Generalized from hard-coded fields; defaults match the originals.)
   */
  constructor(
    private concurrency = 10,
    private maxQueueSize = 1000
  ) {}

  /**
   * Accept an item, or throw immediately if the queue is at capacity.
   * Resolves once the item is *queued*, not once it is processed.
   */
  async enqueue(item: T): Promise<void> {
    if (this.queue.length >= this.maxQueueSize) {
      throw new Error('Queue full; rejecting request');
    }

    this.queue.push(item);
    this.processQueue();
  }

  /**
   * Dispatch queued items until the concurrency limit is reached.
   *
   * BUG FIX: the original `await`ed each item inside the loop, so one
   * dispatch pass ran items strictly one at a time and the configured
   * concurrency was only ever reached by coincidence (one overlapping
   * pass per enqueue call). Items are now started without awaiting, and
   * each completion re-triggers dispatch to pull the next item.
   */
  private processQueue(): void {
    while (this.processing < this.concurrency && this.queue.length > 0) {
      // BUG FIX: the length check guarantees an item, and a falsy T
      // (0, '', false) must not terminate dispatch as `if (!item)` did.
      const item = this.queue.shift() as T;

      this.processing++;
      this.process(item)
        .catch(error => {
          console.error('Failed to process item:', error);
        })
        .finally(() => {
          this.processing--;
          this.processQueue();
        });
    }
  }

  private async process(item: T): Promise<void> {
    // Implementation
  }

  /** queued/processing counts plus utilization of total capacity. */
  getMetrics(): { queued: number; processing: number; utilization: number } {
    return {
      queued: this.queue.length,
      processing: this.processing,
      utilization:
        (this.processing + this.queue.length) /
        (this.concurrency + this.maxQueueSize),
    };
  }
}

Measuring Bulkhead Effectiveness

class BulkheadMetrics {
  // BUG FIX: cap the per-service latency buffer. The original pushed
  // every sample forever — an unbounded memory leak in a long-running
  // process. A sliding window of the most recent samples keeps memory
  // bounded and makes percentiles reflect recent traffic.
  private static readonly MAX_LATENCY_SAMPLES = 10000;

  private metrics: Map<
    string,
    {
      requests: number;
      successful: number;
      rejected: number;
      bulkheadExhausted: number;
      latencies: number[];
    }
  > = new Map();

  /** Count an attempted call (successful or not). */
  recordAttempt(serviceName: string): void {
    const metric = this.getOrCreate(serviceName);
    metric.requests++;
  }

  /** Count a success and record its latency (ms) for percentiles. */
  recordSuccess(serviceName: string, latency: number): void {
    const metric = this.getOrCreate(serviceName);
    metric.successful++;
    metric.latencies.push(latency);
    if (metric.latencies.length > BulkheadMetrics.MAX_LATENCY_SAMPLES) {
      metric.latencies.shift(); // drop the oldest sample
    }
  }

  /** Count a rejection; 'bulkhead' rejections are tracked separately. */
  recordRejection(serviceName: string, reason: 'bulkhead' | 'circuit'): void {
    const metric = this.getOrCreate(serviceName);
    metric.rejected++;
    if (reason === 'bulkhead') {
      metric.bulkheadExhausted++;
    }
  }

  private getOrCreate(serviceName: string) {
    if (!this.metrics.has(serviceName)) {
      this.metrics.set(serviceName, {
        requests: 0,
        successful: 0,
        rejected: 0,
        bulkheadExhausted: 0,
        latencies: [],
      });
    }
    return this.metrics.get(serviceName)!;
  }

  /**
   * Rates over all recorded attempts, plus floor-indexed percentiles
   * over the (windowed) latency samples. Returns all zeros for a
   * service with no recorded attempts.
   */
  getReport(serviceName: string): {
    successRate: number;
    rejectionRate: number;
    bulkheadExhaustionRate: number;
    p50Latency: number;
    p99Latency: number;
  } {
    const metric = this.metrics.get(serviceName);
    if (!metric || metric.requests === 0) {
      return {
        successRate: 0,
        rejectionRate: 0,
        bulkheadExhaustionRate: 0,
        p50Latency: 0,
        p99Latency: 0,
      };
    }

    // Copy before sorting so the stored sample order is untouched.
    const sorted = [...metric.latencies].sort((a, b) => a - b);

    return {
      successRate: metric.successful / metric.requests,
      rejectionRate: metric.rejected / metric.requests,
      bulkheadExhaustionRate: metric.bulkheadExhausted / metric.requests,
      p50Latency: sorted[Math.floor(sorted.length * 0.5)] || 0,
      p99Latency: sorted[Math.floor(sorted.length * 0.99)] || 0,
    };
  }
}

Bulkhead Sizing Formula

Rule of thumb: bulkheadSize = (expected latency in seconds × target throughput in requests/second) + buffer. Keeping the units straight matters — latency is usually quoted in milliseconds and must be converted before multiplying.

class BulkheadSizer {
  /**
   * Calculate optimal bulkhead size
   * @param expectedLatency - Expected P99 latency in ms
   * @param targetThroughput - Desired requests per second
   * @param bufferFraction - Extra capacity (default 0.3 = 30% buffer)
   */
  static calculateSize(
    expectedLatency: number,
    targetThroughput: number,
    bufferFraction = 0.3
  ): number {
    // Convert latency from ms to seconds so the units line up with
    // throughput (requests per second).
    const latencyInSeconds = expectedLatency / 1000;

    // Little's law: concurrent in-flight requests at steady state.
    const baseSize = latencyInSeconds * targetThroughput;

    // Headroom for spikes, then round up to a whole permit.
    const bufferSize = baseSize * bufferFraction;

    return Math.ceil(baseSize + bufferSize);
  }

  /**
   * Example scenarios
   */
  static examples(): void {
    // label, P99 latency (ms), required throughput (req/s), expected size
    const scenarios: Array<[string, number, number]> = [
      ['Payment service bulkhead', 2000, 10], // (2 * 10) + 6 = 26
      ['Search service bulkhead', 500, 100],  // (0.5 * 100) + 15 = 65
      ['Cache service bulkhead', 10, 1000],   // (0.01 * 1000) + 3 = 13
    ];

    for (const [label, latencyMs, throughput] of scenarios) {
      console.log(`${label}: ${this.calculateSize(latencyMs, throughput)}`);
    }
  }
}

Failure Injection Testing

Test bulkheads under load to verify they isolate failures.

describe('BulkheadIsolation', () => {
  it('should isolate slow service from fast service', async () => {
    const bulkhead = new SemaphoreBulkhead({
      'fast-service': 50,
      'slow-service': 10,
    });

    let fastSuccessful = 0;
    let slowSuccessful = 0;

    // Simulate slow service: 5 second latency
    const slowService = async () => {
      await sleep(5000);
      slowSuccessful++;
    };

    // Simulate fast service: 10ms latency
    const fastService = async () => {
      await sleep(10);
      fastSuccessful++;
    };

    // Saturate the slow pool: 10 run, 5 queue behind them.
    for (let i = 0; i < 15; i++) {
      bulkhead.executeWithBulkhead('slow-service', slowService).catch(() => {});
    }

    // The fast pool has its own permits, so it keeps serving even while
    // the slow pool is completely saturated.
    await sleep(100);
    for (let i = 0; i < 50; i++) {
      await bulkhead.executeWithBulkhead('fast-service', fastService);
    }

    // Fast service should complete despite slow service saturation
    expect(fastSuccessful).toBeGreaterThan(40);
  });

  it('should queue requests when bulkhead exhausted', async () => {
    const bulkhead = new SemaphoreBulkhead({ 'limited-service': 5 });

    // Saturate the bulkhead: 5 run immediately, 5 must wait.
    for (let i = 0; i < 10; i++) {
      void bulkhead.executeWithBulkhead('limited-service', () => sleep(1000));
    }

    await sleep(100);

    // BUG FIX: SemaphoreBulkhead *queues* excess requests — it never
    // rejects them — so the original's `rejected` counter stayed at 0
    // and `expect(rejected).toBeGreaterThan(0)` could not pass. Assert
    // on the observable queue depth instead.
    expect(bulkhead.getMetrics('limited-service').waiting).toBe(5);
  });
});

Checklist

  • Size bulkheads based on latency and throughput, not arbitrarily
  • Combine bulkheads with circuit breakers for complete isolation
  • Use semaphores or p-limit to enforce concurrency limits
  • Monitor bulkhead exhaustion as key metric
  • Isolate tenants with separate bulkheads
  • Set queue depth limits to reject early
  • Test under chaos conditions (latency injection, failure injection)
  • Measure bulkhead effectiveness (rejection rate, latency distribution)
  • Document bulkhead sizes and how they were calculated
  • Alert on bulkhead exhaustion before requests fail

Conclusion

The bulkhead pattern is one of your most powerful tools for preventing cascading failures. Size them correctly—too small and you limit throughput, too large and you don't isolate. Combine with circuit breakers for defense in depth. Test under load: the moment one service slows down is when you most want to confirm that other services continue operating normally.