Published on

SRE Runbook Automation — Turning Manual Procedures Into Self-Healing Systems

Authors

Introduction

Runbooks are great until you need them at 2 AM. The best runbook is one that doesn't require a human. Automation transforms manual procedures into decision trees: "If this alert fires and CPU is high, automatically scale up and log it. If it fires and CPU is normal, page on-call." This post covers structuring runbooks as code, automating remediation for common scenarios, measuring effectiveness, and knowing when to page humans versus letting the system self-heal.

Runbook as Code Structure

Define runbooks as YAML with executable steps, decision trees, and rollback logic:

# runbooks/high-error-rate.yaml
# Executable runbook: each step is a metric check, an action (with verification
# and rollback), a monitoring window, an escalation, or a notification.
# Steps are chained via their on_success / on_failure ids.
metadata:
  name: high-error-rate
  title: High Error Rate Response
  severity: P1
  owner: backend-team
  created_at: 2026-03-01
  last_updated: 2026-03-15
  tags:
    - errors
    - api
    - critical

# Alert(s) that route into this runbook.
alert_mapping:
  - alert_name: HighErrorRate
    threshold: error_rate > 0.05
    duration: 5m

# Guardrails for unattended execution: bounded retries with exponential backoff.
auto_remediation:
  enabled: true
  max_attempts: 3
  backoff_multiplier: 2.0
  timeout: 10m

steps:
  # Entry point: is the 5xx volume high enough to act on?
  # NOTE(review): on_success leads to remediation, so "success" here appears to
  # mean "the condition matched" — confirm the runbook engine's semantics.
  - id: check-health
    name: Check service health
    type: check
    condition: |
      {
        "metric": "http_requests_total{status=~'5..'}",
        "comparison": ">",
        "threshold": 50,
        "duration": "5m"
      }
    on_success: restart-service
    on_failure: page-oncall

  # Rolling restart with verification and an automatic rollback path.
  - id: restart-service
    name: Restart affected service
    type: action
    auto_execute: true
    action: kubectl rollout restart deployment/api -n production
    verify:
      # value_from_plan compares against a value captured at plan time —
      # presumably the desired replica count; TODO confirm.
      - name: Verify deployment is healthy
        type: check
        condition: |
          {
            "metric": "deployment_ready_replicas{deployment='api'}",
            "comparison": "==",
            "value_from_plan": "deployment_desired_replicas"
          }
        timeout: 5m
    rollback:
      - kubectl rollout undo deployment/api -n production
    on_success: monitor
    on_failure: escalate

  # NOTE(review): KEYS inside EVAL is O(keyspace) and blocks Redis while it
  # scans — consider SCAN-based deletion for production-sized datasets.
  - id: clear-cache
    name: Clear application cache
    type: action
    auto_execute: true
    action: |
      redis-cli EVAL "return redis.call('del', unpack(redis.call('keys', ARGV[1])))" 0 "app:*"
    verify:
      - name: Wait 30 seconds for cache to warm
        type: wait
        duration: 30s
    on_success: monitor

  # Horizontal scale-out; verified by the ready-replica count.
  - id: scale-up
    name: Scale service to handle load
    type: action
    auto_execute: true
    action: |
      kubectl scale deployment/api --replicas=10 -n production
    verify:
      - name: Wait for new pods
        type: check
        condition: |
          {
            "metric": "deployment_ready_replicas{deployment='api'}",
            "comparison": ">=",
            "threshold": 10
          }
        timeout: 5m
    on_success: monitor

  # Watch key metrics for 10 minutes before declaring victory.
  - id: monitor
    name: Monitor metrics for recovery
    type: monitor
    metrics:
      - error_rate
      - latency_p99
      - cpu_utilization
    duration: 10m
    success_criteria:
      error_rate: "< 0.01"
      latency_p99: "< 500ms"
      cpu_utilization: "< 0.7"
    on_success: notify-success
    on_failure: page-oncall

  # Human escalation — the only step with auto_execute disabled.
  - id: page-oncall
    name: Page on-call engineer
    type: escalation
    auto_execute: false
    escalation_policy: backend-team-primary
    urgency: critical
    details: |
      Automated remediation failed for high error rate alert.
      Last attempted action: {{ last_action }}
      Error: {{ error_message }}
      Runbook: {{ runbook_url }}

  - id: escalate
    name: Escalate to engineering lead
    type: escalation
    escalation_policy: engineering-leadership
    urgency: critical
    page_after: 5m

  - id: notify-success
    name: Notify team of auto-remediation
    type: notification
    channel: slack
    message: |
      :white_check_mark: Auto-remediation successful for high error rate
      - Service restarted and recovered
      - New error rate: {{ current_error_rate }}
      - Recovery time: {{ recovery_duration }}

# NOTE(review): node references below use snake_case (scale_up, page_oncall)
# while step ids above use kebab-case (scale-up, page-oncall), and
# rollback_recent_deployment / clear_cache_and_monitor have no matching step
# definitions — confirm how the engine resolves tree leaves to steps.
decision_tree:
  root:
    condition: error_rate > 0.05
    true: check_recent_deployments
    false: exit

  check_recent_deployments:
    condition: deployment_changed_in_last(30m)
    true: rollback_recent_deployment
    false: check_resource_constraints

  check_resource_constraints:
    condition: cpu_utilization > 0.8 AND available_memory < 1GB
    true: scale_up
    false: check_database_health

  check_database_health:
    condition: database_connection_errors > 100
    true: page_oncall
    false: clear_cache_and_monitor

Automated Remediation Rules

Create decision logic for different failure scenarios:

// automation/remediation-engine.ts
import { KubeConfig, AppsV1Api } from '@kubernetes/client-node';
import { PrometheusClient } from './prometheus-client';

// A declarative remediation rule: when `trigger` has held for `duration`,
// run `actions` in order, then confirm recovery via `verify`.
interface RemediationRule {
  id: string;
  trigger: {
    metric: string;
    // '==' added: the circuit-breaker rule below matches an exact state
    // value (operator: '==') and previously failed to type-check.
    operator: '>' | '<' | '>=' | '<=' | '==';
    threshold: number;
    duration: string; // Prometheus-style duration, e.g. '5m'
  };
  actions: RemediationAction[];
  verify: VerificationStep[];
  rollbackOn?: string[]; // failure tags that trigger rollback, e.g. 'verify_failed'
  maxAttempts?: number; // defaults to 1 in the executor
}

// A single remediation operation against a named target.
interface RemediationAction {
  type: 'scale' | 'restart' | 'cache-clear' | 'circuit-break' | 'rate-limit';
  target: string; // e.g. 'deployment/api' or 'job/nightly-import'
  params: Record<string, any>; // action-specific knobs (namespace, replicas, ...)
}

// Post-action health check polled against Prometheus.
interface VerificationStep {
  metric: string;
  expectedValue: number;
  operator: string; // one of '>', '<', '>=', '<=', '==' — consider tightening to a union
  timeout: number; // seconds to keep polling before giving up
}

// Shared module-level clients: loadFromDefault() resolves the kubeconfig from
// the standard locations (in-cluster service account or local kubeconfig —
// per the client library's documented behavior), and the Prometheus client
// backs the verification polling below.
const k8s = new KubeConfig();
k8s.loadFromDefault();
const appsApi = k8s.makeApiClient(AppsV1Api);
const prometheus = new PrometheusClient('http://prometheus:9090');

// Declarative catalogue of auto-remediation rules, looked up by id when an
// alert fires (see executeRemediationRule).
const remediationRules: RemediationRule[] = [
  // Sustained high CPU: scale the API deployment out, then confirm both the
  // replica count and the CPU level recover before declaring success.
  {
    id: 'high-cpu-auto-scale',
    trigger: {
      metric: 'node_cpu_usage_percent',
      operator: '>',
      threshold: 85,
      duration: '5m',
    },
    actions: [
      {
        type: 'scale',
        target: 'deployment/api',
        params: {
          replicas: 10,
          namespace: 'production',
        },
      },
    ],
    verify: [
      {
        metric: 'deployment_ready_replicas{deployment="api"}',
        expectedValue: 10,
        operator: '>=',
        timeout: 300, // seconds
      },
      {
        metric: 'node_cpu_usage_percent',
        expectedValue: 75,
        operator: '<=',
        timeout: 600, // seconds
      },
    ],
    rollbackOn: ['scale_timeout', 'verify_failed'],
    maxAttempts: 3,
  },

  // Batch job running far past its expected duration: restart it and confirm
  // an active pod comes back.
  {
    id: 'stuck-job-restart',
    trigger: {
      metric: 'k8s_job_duration_seconds',
      operator: '>',
      threshold: 3600, // 1 hour
      duration: '10m',
    },
    actions: [
      {
        type: 'restart',
        target: 'job/nightly-import',
        params: {
          namespace: 'jobs',
          deleteSuccessfulPods: false,
        },
      },
    ],
    verify: [
      {
        metric: 'k8s_job_active_pods{job="nightly-import"}',
        expectedValue: 1,
        operator: '>=',
        timeout: 60, // seconds
      },
    ],
    maxAttempts: 2,
  },

  // Redis approaching its memory ceiling: evict session keys oldest-first
  // until usage drops back under 70%.
  {
    id: 'cache-saturation-evict',
    trigger: {
      metric: 'redis_memory_used_bytes / redis_memory_max_bytes',
      operator: '>',
      threshold: 0.9,
      duration: '5m',
    },
    actions: [
      {
        type: 'cache-clear',
        target: 'redis/production',
        params: {
          pattern: 'session:*',
          retentionPolicy: 'oldest-first',
        },
      },
    ],
    verify: [
      {
        metric: 'redis_memory_used_bytes / redis_memory_max_bytes',
        expectedValue: 0.7,
        operator: '<=',
        timeout: 120, // seconds
      },
    ],
  },

  // External dependency's circuit breaker opened: throttle our client and
  // wait for the breaker to reach half-open.
  // NOTE(review): operator '==' is not in RemediationRule's declared trigger
  // union ('>' | '<' | '>=' | '<='); widen the union or this entry will not
  // type-check.
  {
    id: 'circuit-breaker-open-rate-limit',
    trigger: {
      metric: 'circuit_breaker_state{name="external-api"}',
      operator: '==',
      threshold: 1, // Open state
      duration: '30s',
    },
    actions: [
      {
        type: 'rate-limit',
        target: 'external-api-client',
        params: {
          rps_limit: 10,
          duration: 300, // seconds to keep the limit in place
          backoff_multiplier: 1.5,
        },
      },
    ],
    verify: [
      {
        metric: 'circuit_breaker_state{name="external-api"}',
        expectedValue: 2, // Half-open state
        operator: '==',
        timeout: 180, // seconds
      },
    ],
  },
];

/**
 * Looks up a remediation rule by id and runs its actions with bounded,
 * exponentially backed-off retries. Returns a success summary or throws
 * after all attempts are exhausted.
 *
 * BUG FIX: the original returned `duration: Date.now()` — a wall-clock
 * timestamp, not the elapsed time. We now record a start time and report
 * the actual duration in milliseconds.
 *
 * NOTE(review): alertContext is accepted but currently unused — presumably
 * intended for templating notifications; confirm before removing.
 */
export async function executeRemediationRule(
  ruleId: string,
  alertContext: Record<string, any>
) {
  const rule = remediationRules.find((r) => r.id === ruleId);
  if (!rule) {
    throw new Error(`Rule not found: ${ruleId}`);
  }

  const startedAt = Date.now();
  let attempts = 0;
  // ?? instead of ||: an explicit maxAttempts of 0 now means "never attempt"
  // rather than silently becoming 1.
  const maxAttempts = rule.maxAttempts ?? 1;

  while (attempts < maxAttempts) {
    attempts++;
    console.log(
      `Executing remediation rule ${ruleId}, attempt ${attempts}/${maxAttempts}`
    );

    try {
      // Execute each action in declared order.
      for (const action of rule.actions) {
        await executeAction(action);
      }

      // Every verification step must pass for the attempt to succeed.
      let allVerified = true;
      for (const step of rule.verify) {
        const verified = await verifyStep(step);
        if (!verified) {
          console.log(
            `Verification failed: ${step.metric} did not meet expectation`
          );
          allVerified = false;
          break;
        }
      }

      if (allVerified) {
        console.log(`Remediation rule ${ruleId} succeeded`);
        return { success: true, attempts, duration: Date.now() - startedAt };
      }

      // Verification failed — roll back if the rule opts in.
      if (rule.rollbackOn?.includes('verify_failed')) {
        await rollbackActions(rule.actions);
      }
    } catch (error) {
      console.error(`Remediation rule ${ruleId} error: ${error}`);

      if (rule.rollbackOn?.includes('action_error')) {
        await rollbackActions(rule.actions);
      }
    }

    if (attempts < maxAttempts) {
      const backoffMs = Math.pow(2, attempts - 1) * 1000; // 1s, 2s, 4s, ...
      console.log(`Retrying after ${backoffMs}ms`);
      await new Promise((resolve) => setTimeout(resolve, backoffMs));
    }
  }

  throw new Error(
    `Remediation rule ${ruleId} failed after ${maxAttempts} attempts`
  );
}

/**
 * Executes a single remediation action against its target.
 *
 * Fixes over the original:
 * - `patchNamespacedDeploymentSpec` does not exist in @kubernetes/client-node;
 *   both k8s branches now use read-modify-write via readNamespacedDeployment +
 *   replaceNamespacedDeployment (a full-object patch would also require a
 *   merge-patch Content-Type header).
 * - 'rate-limit' (used by the circuit-breaker rule) and 'circuit-break' no
 *   longer fall into the throwing default branch.
 * - case bodies are wrapped in blocks so each `const` is properly scoped.
 */
async function executeAction(action: RemediationAction) {
  switch (action.type) {
    case 'scale': {
      // target is '<kind>/<name>'; only the name is needed for the API call.
      const [, name] = action.target.split('/');
      const deployment = await appsApi.readNamespacedDeployment(
        name,
        action.params.namespace
      );
      deployment.body.spec!.replicas = action.params.replicas;
      await appsApi.replaceNamespacedDeployment(
        name,
        action.params.namespace,
        deployment.body
      );
      console.log(`Scaled ${action.target} to ${action.params.replicas}`);
      break;
    }

    case 'restart': {
      // NOTE(review): the stuck-job rule targets 'job/nightly-import', but this
      // branch only handles Deployments — confirm Jobs need a separate path.
      const [, name] = action.target.split('/');
      const deployment = await appsApi.readNamespacedDeployment(
        name,
        action.params.namespace
      );
      // Touching a pod-template annotation forces a rolling restart (the same
      // mechanism `kubectl rollout restart` uses).
      const template = deployment.body.spec!.template;
      template.metadata = template.metadata ?? {};
      template.metadata.annotations = {
        ...template.metadata.annotations,
        'restartedAt': new Date().toISOString(),
      };
      await appsApi.replaceNamespacedDeployment(
        name,
        action.params.namespace,
        deployment.body
      );
      console.log(`Restarted ${action.target}`);
      break;
    }

    case 'cache-clear': {
      console.log(`Clearing cache: ${action.target}`);
      // Redis EVAL script for clearing
      break;
    }

    case 'circuit-break':
    case 'rate-limit': {
      // Previously these declared action types hit the throwing default
      // branch; placeholder until the client-side throttle is implemented.
      console.log(`Applying ${action.type} to ${action.target}`);
      break;
    }

    default:
      throw new Error(`Unknown action type: ${action.type}`);
  }
}

/**
 * Polls Prometheus every 5s until the step's comparison holds or the step's
 * timeout (seconds) elapses. Returns true on success, false on timeout.
 *
 * BUG FIX: Prometheus instant-vector samples carry the value as a STRING in
 * `value[1]` (the API returns `[timestamp, "<string>"]`), so the original
 * `value === step.expectedValue` strict comparison against a number could
 * never succeed, and '>' / '<' operators silently evaluated to false. The
 * value is now parsed to a number and all five operators are supported.
 */
async function verifyStep(step: VerificationStep): Promise<boolean> {
  const timeoutMs = step.timeout * 1000;
  const startTime = Date.now();

  while (Date.now() - startTime < timeoutMs) {
    try {
      const result = await prometheus.query(step.metric);
      const raw = result[0]?.value?.[1];
      const value = raw === undefined ? NaN : parseFloat(raw);

      if (!Number.isNaN(value)) {
        let verified = false;
        switch (step.operator) {
          case '>':
            verified = value > step.expectedValue;
            break;
          case '<':
            verified = value < step.expectedValue;
            break;
          case '>=':
            verified = value >= step.expectedValue;
            break;
          case '<=':
            verified = value <= step.expectedValue;
            break;
          case '==':
            verified = value === step.expectedValue;
            break;
          default:
            verified = false; // unknown operator: fail closed
        }
        if (verified) {
          return true;
        }
      }
    } catch (error) {
      console.error(`Query failed: ${step.metric} - ${error}`);
    }

    await new Promise((resolve) => setTimeout(resolve, 5000));
  }

  return false;
}

// Undoes a list of remediation actions in the reverse of the order they
// were applied (LIFO), so later changes are unwound first.
async function rollbackActions(actions: RemediationAction[]) {
  for (const action of [...actions].reverse()) {
    console.log(`Rolling back action: ${action.type}`);

    // Implement rollback logic per action type
  }
}

PagerDuty Integration and Runbook Linking

Link runbooks directly in PagerDuty alerts:

// automation/pagerduty-integration.ts
import { PagerDutyClient } from '@pagerduty/pdjs';

// Module-level PagerDuty client.
// NOTE(review): confirm `PagerDutyClient` is exported by the pinned
// @pagerduty/pdjs version — recent versions expose `api`/`event` helpers.
const pdClient = new PagerDutyClient({
  token: process.env.PAGERDUTY_API_TOKEN, // undefined if unset — TODO validate at startup
});

/**
 * Opens a PagerDuty incident with the runbook URL embedded in its body, then
 * attaches the runbook link as a timeline note. Returns the created incident.
 */
export async function createIncidentWithRunbook(
  title: string,
  severity: 'critical' | 'high' | 'medium' | 'low',
  runbookUrl: string,
  escalationPolicyId: string
) {
  const urgency = severity === 'critical' ? 'high' : 'low';

  const created = await pdClient.incidents.create({
    incident: {
      type: 'incident',
      title,
      service: {
        id: 'PXXXXX', // Service ID
        type: 'service_reference',
      },
      urgency,
      body: {
        type: 'incident_body',
        details: `Runbook: ${runbookUrl}`,
      },
      escalation_policy: {
        type: 'escalation_policy_reference',
        id: escalationPolicyId,
      },
    },
  });

  // Surface the runbook in the incident timeline as well.
  await pdClient.incidents.notes.create(created.id, {
    note: {
      content: `**Runbook**: [View Runbook](${runbookUrl})`,
    },
  });

  return created;
}

/**
 * Embeds a runbook link in an escalation policy's description so responders
 * see it on every incident routed through the policy.
 */
export async function linkRunbookToEscalationPolicy(
  escalationPolicyId: string,
  runbookUrl: string
) {
  // Fetch the current policy so the update preserves its other fields.
  const existing = await pdClient.escalationPolicies.get(escalationPolicyId);

  const description = `Escalation for critical issues. Runbook: ${runbookUrl}`;

  await pdClient.escalationPolicies.update(escalationPolicyId, {
    escalation_policy: {
      ...existing,
      description,
    },
  });
}

// Kicks off auto-remediation for a PagerDuty incident and records the attempt
// on the incident timeline. Returns the remediation result promise.
// NOTE(review): executeRemediationRule is not imported in this file — it is
// defined in automation/remediation-engine.ts; add the import or this module
// will not compile.
export async function triggerRemediationFromPagerDuty(
  incidentId: string,
  remediationRuleId: string
) {
  // Create acknowledgement note showing remediation was triggered
  await pdClient.incidents.notes.create(incidentId, {
    note: {
      content: `Auto-remediation triggered: ${remediationRuleId}`,
    },
  });

  // Execute remediation; the incident id is passed as alert context.
  return executeRemediationRule(remediationRuleId, {
    incidentId,
  });
}

Runbook Effectiveness Metrics

Measure the impact of runbook automation:

// metrics/runbook-effectiveness.ts
// Aggregated effectiveness stats for one remediation rule over a date window.
interface RunbookMetrics {
  ruleId: string;
  totalIncidents: number;
  autoRemediatedSuccessfully: number; // closed without human involvement
  autoRemediationAttempted: number; // automation ran, regardless of outcome
  manualResolvedAfterAutoFailed: number; // automation ran but a human finished the job
  mttrBefore: number; // seconds
  mttrAfter: number; // seconds
  mttrImprovement: number; // percentage
  userPageAvoidance: number; // percentage of incidents that didn't page
}

/**
 * Computes effectiveness metrics for one remediation rule over
 * [startDate, endDate].
 *
 * BUG FIX: the original divided each cohort's summed resolution time by the
 * TOTAL incident count (`incidents.length`) instead of the cohort's own size,
 * skewing both MTTR figures whenever the before/after cohorts differ in size.
 * Empty cohorts and zero total incidents now yield 0 instead of NaN.
 */
export async function calculateRunbookEffectiveness(
  ruleId: string,
  startDate: Date,
  endDate: Date
): Promise<RunbookMetrics> {
  // Query incident database for incidents matching this rule.
  // NOTE(review): queryIncidents is neither defined nor imported in this
  // file — confirm which module provides it.
  const incidents = await queryIncidents({
    ruleId,
    startDate,
    endDate,
  });

  const totalIncidents = incidents.length;

  const autoRemediatedSuccessfully = incidents.filter(
    (i) => i.status === 'auto-resolved'
  ).length;

  const autoRemediationAttempted = incidents.filter(
    (i) => i.automationAttempted === true
  ).length;

  const manualResolvedAfterAutoFailed = incidents.filter(
    (i) =>
      i.automationAttempted === true &&
      i.automationSucceeded === false &&
      i.status === 'manually-resolved'
  ).length;

  // Average of a list; 0 for an empty list (avoids NaN from 0/0).
  const mean = (xs: number[]): number =>
    xs.length === 0 ? 0 : xs.reduce((a, b) => a + b, 0) / xs.length;

  const beforeAutomation = mean(
    incidents
      .filter((i) => !i.automationEnabled)
      .map((i) => i.resolutionTime)
  );

  const afterAutomation = mean(
    incidents
      .filter((i) => i.automationEnabled)
      .map((i) => i.resolutionTime)
  );

  const mttrImprovement =
    beforeAutomation === 0
      ? 0
      : ((beforeAutomation - afterAutomation) / beforeAutomation) * 100;

  const userPageAvoidance =
    totalIncidents === 0
      ? 0
      : (autoRemediatedSuccessfully / totalIncidents) * 100;

  return {
    ruleId,
    totalIncidents,
    autoRemediatedSuccessfully,
    autoRemediationAttempted,
    manualResolvedAfterAutoFailed,
    mttrBefore: beforeAutomation,
    mttrAfter: afterAutomation,
    mttrImprovement,
    userPageAvoidance,
  };
}

/**
 * Prints a 30-day effectiveness report for a fixed set of rules.
 *
 * BUG FIX: the success-rate line divided by autoRemediationAttempted without
 * a zero guard, printing "NaN%" for rules with no automation attempts.
 */
export async function reportRunbookEffectiveness() {
  const rules = ['high-cpu', 'high-error-rate', 'stuck-job', 'cache-full'];

  // Evaluate all rules in parallel over the trailing 30 days.
  const metrics = await Promise.all(
    rules.map((ruleId) =>
      calculateRunbookEffectiveness(
        ruleId,
        new Date(Date.now() - 30 * 24 * 60 * 60 * 1000),
        new Date()
      )
    )
  );

  console.log('=== Runbook Effectiveness Report ===');
  for (const metric of metrics) {
    // Guard against 0/0 → NaN when a rule never attempted remediation.
    const successRate =
      metric.autoRemediationAttempted === 0
        ? 0
        : (metric.autoRemediatedSuccessfully / metric.autoRemediationAttempted) *
          100;

    console.log(`\nRule: ${metric.ruleId}`);
    console.log(`  Total incidents: ${metric.totalIncidents}`);
    console.log(
      `  Auto-remediated: ${metric.autoRemediatedSuccessfully}/${metric.autoRemediationAttempted}`
    );
    console.log(`  Success rate: ${successRate.toFixed(1)}%`);
    console.log(`  MTTR before: ${metric.mttrBefore}s`);
    console.log(`  MTTR after: ${metric.mttrAfter}s`);
    console.log(`  MTTR improvement: ${metric.mttrImprovement.toFixed(1)}%`);
    console.log(
      `  User pages avoided: ${metric.userPageAvoidance.toFixed(1)}%`
    );
  }
}

Decision Tree for Auto-Remediation

// automation/remediation-decision-tree.ts
// One node of the binary decision tree. Leaves are action-name strings;
// internal nodes carry a textual condition evaluated against live metrics.
export interface DecisionNode {
  condition: string; // e.g. 'error_rate > 0.01' — parsed as "metric op threshold"
  threshold?: number; // unused by the evaluator below — TODO confirm before removing
  onTrue: string | DecisionNode;
  onFalse: string | DecisionNode;
}

// Triage tree for elevated error rates: most-likely cause first (recent
// deploy), then resource pressure, then database health, then cache.
const remediationDecisionTree: DecisionNode = {
  condition: 'error_rate > 0.01',
  onTrue: {
    // NOTE(review): evaluateCondition only parses "metric op threshold";
    // a function-style condition like this splits into a single token and
    // always evaluates to false — confirm intended handling.
    condition: 'recent_deployment_in_last(30m)',
    onTrue: 'rollback_deployment',
    onFalse: {
      condition: 'cpu_utilization > 0.8',
      onTrue: 'scale_up_service',
      onFalse: {
        condition: 'database_connection_errors > 100',
        onTrue: 'page_oncall', // Database might be down
        onFalse: 'clear_cache',
      },
    },
  },
  onFalse: 'no_action', // Error rate is acceptable
};

/**
 * Walks the decision tree until it reaches a leaf and returns that leaf's
 * action name. Internal nodes are evaluated against the supplied metrics
 * snapshot; a string node is already a leaf.
 */
export async function evaluateDecisionTree(
  node: DecisionNode | string,
  metrics: Record<string, number>
): Promise<string> {
  // Iterative descent: keep following the branch chosen by each condition.
  let current: DecisionNode | string = node;
  while (typeof current !== 'string') {
    const met = await evaluateCondition(current.condition, metrics);
    current = met ? current.onTrue : current.onFalse;
  }
  return current;
}

/**
 * Evaluates a condition of the form "metric op threshold" (whitespace
 * separated, e.g. "error_rate > 0.01") against a metrics snapshot.
 * Unknown metrics or unrecognized operators evaluate to false.
 */
async function evaluateCondition(
  condition: string,
  metrics: Record<string, number>
): Promise<boolean> {
  const [metricName, op, rawThreshold] = condition.split(/\s+/);

  const observed = metrics[metricName];
  if (observed === undefined) {
    return false;
  }

  const threshold = parseFloat(rawThreshold);

  // Comparator lookup table instead of a switch.
  const comparators: Record<string, (a: number, b: number) => boolean> = {
    '>': (a, b) => a > b,
    '<': (a, b) => a < b,
    '>=': (a, b) => a >= b,
    '<=': (a, b) => a <= b,
    '==': (a, b) => a === b,
  };

  const compare = comparators[op];
  return compare ? compare(observed, threshold) : false;
}

Checklist

  • Document runbooks as YAML with decision trees
  • Implement auto-remediation for top 5 incident types
  • Set maximum remediation attempts (prevents infinite loops)
  • Define verify steps for each remediation action
  • Implement rollback procedures
  • Link runbooks in PagerDuty incident templates
  • Track auto-remediation success rate
  • Measure MTTR before/after automation
  • Review failed auto-remediation attempts monthly
  • Calculate toil hours eliminated by automation

Conclusion

Every time you manually run the same steps to resolve an incident, you've found an automation opportunity. Start with your top three incident types: high error rate, high CPU, stuck jobs. Automate the diagnosis and the first remediation attempt. Track the success rate. When a remediation fails, fix the rule. Over time, your runbooks become a self-healing system. You page humans only when the decision tree is uncertain. Your MTTR drops. Your on-call sleep improves. That's the SRE promise.