Published on

The Saga Pattern — Managing Distributed Transactions Without Two-Phase Commit

Authors

Introduction

Distributed transactions that span multiple services are one of the hardest problems in microservices architecture. Two-phase commit (2PC) is often too slow and brittle. The saga pattern provides a better way: decompose a distributed transaction into a sequence of local transactions, each paired with a compensating transaction that undoes its effects if a later step fails. We'll explore both choreography and orchestration approaches, tackle idempotent compensation, and handle the complexity of partial failures.

Choreography vs Orchestration

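Choreography sagas rely on events: each service listens for events, performs its local transaction, and publishes a success or failure event; a failure event triggers compensation in the services that have already completed their steps. Orchestration uses a central coordinator that explicitly drives the transaction flow.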

Choreography example:

// Event-driven saga for order creation
/**
 * Payload carried by the `order.created` event.
 */
interface OrderCreatedEvent {
  /** Identifier of the order the event refers to. */
  orderId: string;
  /** Identifier of the customer who placed the order. */
  customerId: string;
  /** Line items of the order: a SKU plus the quantity requested. */
  items: { sku: string; quantity: number }[];
  /** Total amount of the order. */
  totalAmount: number;
}

class OrderService {
  /**
   * Generates an order id and announces the new order on the event bus.
   *
   * @param req - Order request supplying `customerId`, `items` and `totalAmount`.
   * @returns The generated order id, after the `order.created` event was published.
   */
  async createOrder(req: CreateOrderRequest): Promise<string> {
    const orderId = uuid();
    const { customerId, items, totalAmount } = req;
    const orderCreated = { orderId, customerId, items, totalAmount };

    // No service is called directly; subscribers react to the event.
    await this.eventBus.publish('order.created', orderCreated);

    return orderId;
  }
}

/**
 * Inventory participant in the choreographed order saga.
 *
 * Reserves stock when an order is created and releases it again when the
 * payment for that order fails, publishing the outcome of each step.
 */
class InventoryService {
  private eventBus: EventBus;

  /**
   * Subscribes the service to the saga events it reacts to.
   *
   * @param eventBus - Bus used both for subscribing and for publishing outcomes.
   */
  constructor(eventBus: EventBus) {
    this.eventBus = eventBus;
    this.eventBus.on('order.created', this.handleOrderCreated.bind(this));
    this.eventBus.on('payment.failed', this.handlePaymentFailed.bind(this));
  }

  /**
   * Reserves stock for a new order and publishes `inventory.reserved`, or
   * `inventory.reservation.failed` when the reservation itself fails.
   *
   * Only the reservation is inside the `try`: if publishing the success event
   * fails, the error propagates instead of being reported as a reservation
   * failure while the stock is in fact still reserved.
   *
   * @param event - The `order.created` payload.
   */
  async handleOrderCreated(event: OrderCreatedEvent): Promise<void> {
    try {
      await this.reserveInventory(event.orderId, event.items);
    } catch (error) {
      console.error(`Inventory reservation failed for order ${event.orderId}:`, error);
      await this.eventBus.publish('inventory.reservation.failed', { orderId: event.orderId });
      return;
    }

    await this.eventBus.publish('inventory.reserved', { orderId: event.orderId });
  }

  /**
   * Compensation step: releases the stock held for an order whose payment
   * failed, then publishes `inventory.released`.
   *
   * NOTE(review): nothing here records that an order was already released, so
   * a redelivered `payment.failed` event decrements the counters again.
   *
   * @param event - Payload identifying the order to release.
   */
  async handlePaymentFailed(event: { orderId: string }): Promise<void> {
    await this.releaseInventory(event.orderId);
    await this.eventBus.publish('inventory.released', { orderId: event.orderId });
  }

  /**
   * Reserves every item of an order, one conditional UPDATE per SKU.
   *
   * If any item cannot be reserved, the items reserved so far are given back
   * before the error is rethrown, so a failed order does not keep holding
   * stock. This is a best-effort undo, not a database transaction.
   *
   * @param orderId - Order the reservation belongs to (used for diagnostics).
   * @param items - SKUs and quantities to reserve.
   * @throws Error when an item has insufficient stock, or when a query fails.
   */
  private async reserveInventory(
    orderId: string,
    items: Array<{ sku: string; quantity: number }>
  ): Promise<void> {
    const reservedSoFar: Array<{ sku: string; quantity: number }> = [];

    try {
      for (const item of items) {
        const reserved = await this.db.query(
          `UPDATE inventory SET reserved = reserved + $1
         WHERE sku = $2 AND available >= $1 RETURNING id`,
          [item.quantity, item.sku]
        );
        // No row returned means the `available >= $1` guard rejected the update.
        if (reserved.rows.length === 0) {
          throw new Error(`Insufficient stock for ${item.sku}`);
        }
        reservedSoFar.push(item);
      }
    } catch (error) {
      await this.undoReservations(orderId, reservedSoFar);
      throw error;
    }
  }

  /**
   * Gives back reservations made earlier in a failed `reserveInventory` call.
   * Failures are logged and do not mask the original reservation error.
   */
  private async undoReservations(
    orderId: string,
    reserved: Array<{ sku: string; quantity: number }>
  ): Promise<void> {
    for (const item of [...reserved].reverse()) {
      try {
        await this.db.query(
          `UPDATE inventory SET reserved = reserved - $1 WHERE sku = $2`,
          [item.quantity, item.sku]
        );
      } catch (undoError) {
        console.error(
          `Failed to undo reservation of ${item.sku} for order ${orderId}:`,
          undoError
        );
      }
    }
  }

  /**
   * Decrements the reserved counters by the quantities of the order's items.
   *
   * The join predicate is required: `UPDATE ... FROM` without one pairs every
   * inventory row with the order's items and would decrement unrelated SKUs.
   * Assumes `order_items` has `sku` and `quantity` columns — TODO confirm
   * against the schema.
   *
   * @param orderId - Order whose reservation is released.
   */
  private async releaseInventory(orderId: string): Promise<void> {
    await this.db.query(
      `UPDATE inventory SET reserved = inventory.reserved - order_items.quantity
       FROM order_items
       WHERE order_items.order_id = $1 AND inventory.sku = order_items.sku`,
      [orderId]
    );
  }
}

/**
 * Payment participant in the choreographed order saga.
 */
class PaymentService {
  /**
   * Charges the customer once inventory has been reserved and publishes
   * `payment.completed`, or `payment.failed` when loading the order or
   * charging the card fails.
   *
   * Only the lookup and the charge are inside the `try`. If publishing
   * `payment.completed` fails after a successful charge, the error propagates
   * to the caller; it must not be turned into `payment.failed`, which would
   * release inventory for an order that has been paid.
   *
   * @param event - Payload identifying the order whose inventory was reserved.
   */
  async handleInventoryReserved(event: { orderId: string }): Promise<void> {
    try {
      const order = await this.getOrder(event.orderId);
      await this.chargeCard(order.customerId, order.totalAmount);
    } catch (error) {
      console.error(`Payment failed for order ${event.orderId}:`, error);
      await this.eventBus.publish('payment.failed', { orderId: event.orderId });
      return;
    }

    await this.eventBus.publish('payment.completed', { orderId: event.orderId });
  }
}

Orchestration example:

// Central saga orchestrator
/**
 * Shape of a saga description: a list of steps, each naming a service, an
 * action on it, and the compensation associated with that action.
 */
interface SagaDefinition {
  steps: {
    service: string;
    action: string;
    compensation: string;
  }[];
}

/**
 * Central coordinator for the order saga.
 *
 * Runs the steps reserve inventory -> charge payment -> ship order in sequence.
 * When a step fails, the steps that already completed are compensated in
 * reverse order. Saga state is kept in memory only.
 */
class SagaOrchestrator {
  /**
   * Maps a completed step to the service and action that undo it. Each
   * compensation is sent to the service that performed the original step.
   */
  private static readonly compensationTargets: Record<
    string,
    { service: string; action: string }
  > = {
    inventory_reserved: { service: 'inventory', action: 'release' },
    payment_completed: { service: 'payment', action: 'refund' },
    order_fulfilled: { service: 'fulfillment', action: 'cancel' },
  };

  private sagaStates = new Map<string, SagaState>();

  /** Compensations that threw, recorded as `<service>_<action>`. */
  private failedCompensations: string[] = [];

  /**
   * @param eventBus - Event bus handed to the orchestrator (not used by the
   *   steps below, which call services directly).
   * @param serviceRegistry - Transport used to invoke an action on a service.
   */
  constructor(
    private eventBus: EventBus,
    private serviceRegistry: {
      call(service: string, action: string, payload: unknown): Promise<void>;
    }
  ) {}

  /**
   * Runs the saga for one order. Never rejects because a step failed: a
   * failure triggers compensation and the saga ends in status `compensated`.
   *
   * @param orderId - Order to process; also the key of the saga state.
   */
  async executeSaga(orderId: string): Promise<void> {
    const state: SagaState = {
      orderId,
      status: 'pending',
      completedSteps: [],
      createdAt: Date.now(),
    };
    this.sagaStates.set(orderId, state);

    try {
      // Step 1: Reserve inventory
      await this.callService('inventory', 'reserve', { orderId });
      state.completedSteps.push('inventory_reserved');
      state.status = 'inventory_reserved';

      // Step 2: Process payment
      await this.callService('payment', 'charge', { orderId });
      state.completedSteps.push('payment_completed');
      state.status = 'payment_completed';

      // Step 3: Fulfill order
      await this.callService('fulfillment', 'ship', { orderId });
      state.completedSteps.push('order_fulfilled');
      state.status = 'completed';
    } catch (error) {
      console.error(`Saga for order ${orderId} failed; compensating:`, error);
      // Compensate in reverse order
      await this.compensate(state);
      state.status = 'compensated';
    }
  }

  /**
   * Returns the in-memory state of a saga.
   *
   * @param orderId - Order whose saga state is requested.
   * @returns The state, or `undefined` if no saga was started for the order.
   */
  async getState(orderId: string): Promise<SagaState | undefined> {
    return this.sagaStates.get(orderId);
  }

  /**
   * Lists the compensations that failed, across all sagas run by this
   * instance, as `<service>_<action>` (for example `inventory_release`).
   *
   * @returns A copy of the recorded failures, in the order they occurred.
   */
  async getFailedCompensations(): Promise<string[]> {
    return [...this.failedCompensations];
  }

  /**
   * Undoes the completed steps, most recent first. A failing compensation is
   * logged and recorded, and the remaining compensations still run.
   */
  private async compensate(state: SagaState): Promise<void> {
    const reversedSteps = [...state.completedSteps].reverse();
    for (const step of reversedSteps) {
      try {
        await this.callCompensation(state.orderId, step);
      } catch (err) {
        // Log and potentially trigger manual intervention
        console.error(`Compensation failed for ${step}:`, err);
        const target = SagaOrchestrator.compensationTargets[step];
        this.failedCompensations.push(
          target ? `${target.service}_${target.action}` : step
        );
      }
    }
  }

  /**
   * Calls an action on a service, retrying up to three times in total with
   * exponential backoff (100 ms, then 200 ms) between attempts.
   *
   * @throws The error of the last attempt when all attempts fail.
   */
  private async callService(service: string, action: string, payload: unknown): Promise<void> {
    const maxRetries = 3;
    for (let i = 0; i < maxRetries; i++) {
      try {
        return await this.serviceRegistry.call(service, action, payload);
      } catch (error) {
        if (i === maxRetries - 1) throw error;
        await sleep(Math.pow(2, i) * 100);
      }
    }
  }

  /**
   * Invokes the compensation registered for a completed step.
   *
   * @throws Error when no compensation is registered for `step`.
   */
  private async callCompensation(orderId: string, step: string): Promise<void> {
    const target = SagaOrchestrator.compensationTargets[step];
    if (!target) {
      throw new Error(`No compensation registered for step ${step}`);
    }
    await this.callService(target.service, target.action, { orderId });
  }
}

/**
 * Progress record of one saga run.
 */
interface SagaState {
  /** Order the saga belongs to. */
  orderId: string;
  /** Current position of the saga in its lifecycle. */
  status:
    | 'pending'
    | 'inventory_reserved'
    | 'payment_completed'
    | 'completed'
    | 'compensated';
  /** Names of the steps that have finished so far. */
  completedSteps: Array<string>;
  /** Timestamp in epoch milliseconds. */
  createdAt: number;
}

Idempotent Compensations

Compensations must be idempotent—running them twice should produce the same result as running once.

class PaymentService {
  /**
   * Refunds the payment of an order at most once per order.
   *
   * A `refunds` row keyed by `refund-<orderId>` is inserted with status
   * `pending` BEFORE the provider is called. The unique `idempotency_key`
   * lets only one caller win that insert, so concurrent or replayed calls
   * cannot both reach the payment provider. A check-then-refund-then-insert
   * sequence would let two calls pass the check before either writes.
   *
   * Assumes the `status` column accepts the value `pending` — TODO confirm
   * against the schema.
   *
   * @param orderId - Order whose payment is refunded.
   * @throws Error when another refund for the same order is still pending
   *   (in flight, or left behind by a crashed attempt) so the caller can retry
   *   or escalate instead of refunding twice.
   * @throws Whatever the payment provider throws; the pending claim is removed
   *   first so a later retry can run.
   */
  async refund(orderId: string): Promise<void> {
    // Check if refund already processed
    if (await this.isRefundCompleted(orderId)) {
      return; // Already refunded, safe to return
    }

    const order = await this.getOrder(orderId);
    const idempotencyKey = `refund-${orderId}`;

    // Claim the refund; a conflicting key means somebody else holds the claim.
    const claim = await this.db.query(
      `INSERT INTO refunds (order_id, payment_id, amount, status, idempotency_key)
       VALUES ($1, $2, $3, $4, $5)
       ON CONFLICT (idempotency_key) DO NOTHING
       RETURNING id`,
      [orderId, order.paymentId, order.totalAmount, 'pending', idempotencyKey]
    );

    if (claim.rows.length === 0) {
      // The other attempt may have completed between our check and our insert.
      if (await this.isRefundCompleted(orderId)) {
        return;
      }
      throw new Error(`Refund for order ${orderId} is already in progress`);
    }

    try {
      await this.paymentProvider.refund(order.paymentId, order.totalAmount);
    } catch (error) {
      await this.releaseRefundClaim(idempotencyKey);
      throw error;
    }

    await this.db.query(
      `UPDATE refunds SET status = 'completed' WHERE idempotency_key = $1`,
      [idempotencyKey]
    );
  }

  /** Returns true when a completed refund is recorded for the order. */
  private async isRefundCompleted(orderId: string): Promise<boolean> {
    const existing = await this.db.query(
      `SELECT id FROM refunds WHERE order_id = $1 AND status = 'completed'`,
      [orderId]
    );
    return existing.rows.length > 0;
  }

  /**
   * Removes a pending claim after the provider call failed. A failure here is
   * logged so it does not mask the provider error.
   */
  private async releaseRefundClaim(idempotencyKey: string): Promise<void> {
    try {
      await this.db.query(
        `DELETE FROM refunds WHERE idempotency_key = $1 AND status = 'pending'`,
        [idempotencyKey]
      );
    } catch (cleanupError) {
      console.error(`Failed to release refund claim ${idempotencyKey}:`, cleanupError);
    }
  }

  /**
   * Decrements the reserved counter for an order and writes an audit record
   * when at least one row was updated.
   *
   * NOTE(review): the `reserved > 0` guard only skips rows whose counter is
   * already non-positive. It does not record that this order was released, so
   * a second call still decrements while the counter is positive. Making this
   * idempotent needs a per-order release marker.
   *
   * @param orderId - Order whose reservation is released.
   */
  async releaseInventory(orderId: string): Promise<void> {
    // Only release if still reserved
    const result = await this.db.query(
      `UPDATE inventory SET reserved = reserved - quantity
       WHERE order_id = $1 AND reserved > 0
       RETURNING id`,
      [orderId]
    );

    if (result.rows.length === 0) {
      return; // Already released or never reserved
    }

    await this.auditLog.record('inventory_released', { orderId });
  }
}

Saga State Tracking

Persistent saga state enables recovery from failures and provides operational visibility.

/**
 * Persists saga state in the `saga_states` table and resumes sagas that have
 * stopped making progress.
 */
class SagaStateStore {
  /**
   * Inserts or updates the row for an order's saga.
   *
   * @param orderId - Key of the row.
   * @param state - State to persist; `status` and `completedSteps` are stored,
   *   the latter serialized as JSON text.
   */
  async saveState(orderId: string, state: SagaState): Promise<void> {
    await this.db.query(
      `INSERT INTO saga_states (order_id, status, completed_steps, updated_at)
       VALUES ($1, $2, $3, NOW())
       ON CONFLICT (order_id) DO UPDATE SET
         status = EXCLUDED.status,
         completed_steps = EXCLUDED.completed_steps,
         updated_at = NOW()`,
      [orderId, state.status, JSON.stringify(state.completedSteps)]
    );
  }

  /**
   * Loads the persisted state of a saga.
   *
   * NOTE(review): `createdAt` is filled from `updated_at` because the table
   * written by `saveState` has no creation timestamp; it is therefore the time
   * of the last update, not of creation.
   *
   * @param orderId - Order whose saga state is requested.
   * @returns The stored state, or `null` when no row exists for the order.
   */
  async getState(orderId: string): Promise<SagaState | null> {
    const result = await this.db.query(
      `SELECT status, completed_steps, updated_at FROM saga_states WHERE order_id = $1`,
      [orderId]
    );
    const row = result.rows[0];
    if (!row) {
      return null;
    }

    // A text column yields the JSON string written by saveState; a json/jsonb
    // column may already be deserialized by the driver. Parse only strings.
    const completedSteps =
      typeof row.completed_steps === 'string'
        ? JSON.parse(row.completed_steps)
        : row.completed_steps;

    return {
      orderId,
      status: row.status,
      completedSteps,
      createdAt: row.updated_at.getTime(),
    };
  }

  /**
   * Resumes every saga that is neither completed nor compensated and has not
   * been updated for more than five minutes.
   *
   * Each saga is resumed independently: a failure is logged and the remaining
   * sagas are still attempted, so one broken saga cannot block the others.
   */
  async recoverIncompleteTransactions(): Promise<void> {
    const stuckSagas = await this.db.query(
      `SELECT order_id, status FROM saga_states
       WHERE status NOT IN ('completed', 'compensated')
       AND updated_at < NOW() - INTERVAL '5 minutes'`
    );

    for (const saga of stuckSagas.rows) {
      try {
        await this.orchestrator.resumeSaga(saga.order_id);
      } catch (error) {
        console.error(`Failed to resume saga for order ${saga.order_id}:`, error);
      }
    }
  }
}

Testing Sagas with Failure Injection

describe('OrderSaga', () => {
  // Failure injection: every `payment.charge` call throws.
  const declinePayment = () =>
    serviceRegistry.mockService('payment', 'charge', () => {
      throw new Error('Payment declined');
    });

  // Failure injection: the inventory compensation itself throws.
  const breakInventoryRelease = () =>
    serviceRegistry.mockService('inventory', 'release', () => {
      throw new Error('Inventory service down');
    });

  it('should compensate when payment fails', async () => {
    const saga = new SagaOrchestrator(eventBus, serviceRegistry);
    declinePayment();

    await saga.executeSaga('order-123');

    // The saga must end compensated and must have released the reservation.
    const state = await saga.getState('order-123');
    expect(state.status).toBe('compensated');
    const released = serviceRegistry.wasCalledWith('inventory', 'release');
    expect(released).toBe(true);
  });

  it('should handle compensation failures gracefully', async () => {
    const saga = new SagaOrchestrator(eventBus, serviceRegistry);
    declinePayment();
    breakInventoryRelease();

    await saga.executeSaga('order-123');

    // A failed compensation is reported separately; the saga still terminates.
    const state = await saga.getState('order-123');
    expect(state.status).toBe('compensated');
    const failed = await saga.getFailedCompensations();
    expect(failed).toContain('inventory_release');
  });
});

When Sagas Are Overkill

Sagas add operational complexity. Consider alternatives:

  • Simple sequences without rollback: just publish events in order
  • Strong consistency required: use a single transactional database shared across the services (if possible)
  • Rarely fails: monitor and alert instead of building complex compensation

Checklist

  • Identify all steps in your distributed transaction
  • Decide choreography (event-driven) vs orchestration (coordinator)
  • Design idempotent compensating transactions
  • Persist saga state for recovery
  • Implement timeout-based saga completion
  • Test failure scenarios with chaos injection
  • Monitor saga duration and compensation frequency
  • Plan manual intervention procedures for stuck sagas

Conclusion

The saga pattern is powerful for managing distributed transactions without 2PC's blocking. Choreography scales horizontally but is harder to debug; orchestration is explicit but centralizes control in a coordinator. Always design idempotent compensations and persist saga state. Test your failure paths thoroughly—production will find scenarios you didn't anticipate.