Published on

AI Rate Limiting and Cost Quotas — Protecting Your LLM Budget From Runaway Usage

Authors

Introduction

Runaway LLM usage can bankrupt a feature overnight. A single malicious user, bot, or bug can generate thousands in API costs in minutes. This guide covers production quota systems that prevent bill shock while maintaining good user experience.

Per-User Token Budgets With Redis

Track and enforce token budgets per user in real time.

import Redis from 'ioredis';

// Per-user daily quota record as persisted in Redis (JSON-serialized).
// NOTE(review): after JSON.parse the lastResetAt field is an ISO string at
// runtime despite the Date type — the class below casts without reviving it.
interface UserQuota {
  userId: string;            // owner of this quota record
  dailyTokenLimit: number;   // maximum tokens allowed per day
  tokensUsedToday: number;   // snapshot counter; live usage is tracked under a separate key
  lastResetAt: Date;         // when the quota record was (re)created
  tier: 'free' | 'pro' | 'enterprise'; // billing tier
}

class TokenBudgetManager {
  // In current ioredis (v5+) the default export IS the client class, so it
  // doubles as the instance type; `Redis.Redis` is the removed v4 namespace
  // spelling and no longer compiles.
  private readonly redis: Redis;
  private readonly quotaPrefix = 'user_quota:';
  private readonly usagePrefix = 'user_usage:';
  /** Seconds in one day — TTL applied to quota and usage keys. */
  private static readonly DAY_SECONDS = 86400;

  constructor(redisUrl: string) {
    this.redis = new Redis(redisUrl);
  }

  /**
   * Create or replace a user's quota record.
   * The record expires after 24 hours; live usage is a separate counter.
   */
  async setQuota(userId: string, dailyLimit: number, tier: 'free' | 'pro' | 'enterprise'): Promise<void> {
    const key = `${this.quotaPrefix}${userId}`;

    const quota: UserQuota = {
      userId,
      dailyTokenLimit: dailyLimit,
      tokensUsedToday: 0,
      lastResetAt: new Date(),
      tier,
    };

    // Store with 24-hour expiration
    await this.redis.setex(key, TokenBudgetManager.DAY_SECONDS, JSON.stringify(quota));
  }

  /**
   * Check whether tokensRequested fits within the user's remaining budget.
   *
   * Read-only; callers pair it with recordTokenUsage(). The check/record
   * pair is not atomic, so concurrent requests can overshoot the limit
   * slightly — acceptable for a soft quota.
   *
   * @returns allowed, plus the tokens that would remain after the
   *          allocation — or the CURRENT remainder when denied (the old
   *          code subtracted the denied request from the remainder, which
   *          under-reported what the user actually still has).
   * @throws Error when no quota record exists for the user.
   */
  async canAllocateTokens(userId: string, tokensRequested: number): Promise<{ allowed: boolean; remaining: number }> {
    const usageKey = `${this.usagePrefix}${userId}`;
    const quotaKey = `${this.quotaPrefix}${userId}`;

    // Fetch the counter and the quota record in parallel instead of serially.
    const [usage, quotaData] = await Promise.all([
      this.redis.get(usageKey),
      this.redis.get(quotaKey),
    ]);

    if (!quotaData) {
      throw new Error(`No quota configured for user ${userId}`);
    }

    const quota = JSON.parse(quotaData) as UserQuota;
    const currentUsage = parseInt(usage || '0', 10);
    const remaining = quota.dailyTokenLimit - currentUsage;
    const allowed = remaining >= tokensRequested;

    return {
      allowed,
      remaining: Math.max(0, allowed ? remaining - tokensRequested : remaining),
    };
  }

  /**
   * Add tokensUsed to the user's daily counter.
   *
   * The 24h TTL is armed only when INCRBY creates the key. Re-arming it on
   * every call (the old behavior) turned the daily reset into a rolling
   * window that never expired for continuously active users.
   */
  async recordTokenUsage(userId: string, tokensUsed: number): Promise<void> {
    const key = `${this.usagePrefix}${userId}`;

    const newTotal = await this.redis.incrby(key, tokensUsed);
    if (newTotal === tokensUsed) {
      // INCRBY just created the key — start its daily window now.
      await this.redis.expire(key, TokenBudgetManager.DAY_SECONDS);
    }
  }

  /**
   * Report current usage against the configured limit.
   * @throws Error when no quota record exists for the user.
   */
  async getUserUsage(userId: string): Promise<{ used: number; limit: number; remaining: number; percentUsed: number }> {
    const usageKey = `${this.usagePrefix}${userId}`;
    const quotaKey = `${this.quotaPrefix}${userId}`;

    const [usageRaw, quotaData] = await Promise.all([
      this.redis.get(usageKey),
      this.redis.get(quotaKey),
    ]);

    if (!quotaData) {
      throw new Error(`No quota configured for user ${userId}`);
    }

    const quota = JSON.parse(quotaData) as UserQuota;
    const used = parseInt(usageRaw || '0', 10);

    return {
      used,
      limit: quota.dailyTokenLimit,
      remaining: Math.max(0, quota.dailyTokenLimit - used),
      // Guard the zero-limit case: report 100% instead of NaN/Infinity.
      percentUsed: quota.dailyTokenLimit > 0 ? (used / quota.dailyTokenLimit) * 100 : 100,
    };
  }

  /** Clear the user's usage counter, resetting today's consumption to zero. */
  async resetUserQuota(userId: string): Promise<void> {
    const key = `${this.usagePrefix}${userId}`;
    await this.redis.del(key);
  }
}

const budgetManager = new TokenBudgetManager(process.env.REDIS_URL!);

await budgetManager.setQuota('user123', 100000, 'pro');

// Reserve 1000 tokens only if the daily budget allows it.
const allocation = await budgetManager.canAllocateTokens('user123', 1000);
if (allocation.allowed) {
  await budgetManager.recordTokenUsage('user123', 1000);
}

const report = await budgetManager.getUserUsage('user123');
console.log(`Used: ${report.used}/${report.limit} (${report.percentUsed.toFixed(1)}%)`);

Model Tier Quotas

Allocate premium models (GPT-4) only to paying users.

// A named tier of models with its request cap and nominal per-request cost.
interface ModelTier {
  name: string;              // tier identifier, e.g. 'free' or 'pro'
  models: string[];          // model ids available at this tier
  maxRequestsPerDay: number; // request cap for the tier
  costPerRequest: number;    // estimated USD per request
}

// A user's tier assignment and the limits that come with it.
interface UserTier {
  userId: string;
  tier: 'free' | 'pro' | 'enterprise';
  allocatedModels: string[]; // model ids this user may call
  requestsPerDay: number;    // daily request allowance
  monthlyBudget: number;     // USD; -1 means unlimited (enterprise)
}

class ModelTierQuotaManager {
  /** Tier definitions registered via registerModelTier, keyed by name. */
  private modelTiers = new Map<string, ModelTier>();
  /** Per-user tier assignments, keyed by user id. */
  private userTiers = new Map<string, UserTier>();

  /** Per-tier defaults applied when a user's tier changes. */
  private static readonly TIER_PRESETS: Record<
    'free' | 'pro' | 'enterprise',
    { allocatedModels: string[]; requestsPerDay: number; monthlyBudget: number }
  > = {
    free: {
      allocatedModels: ['gpt-3.5-turbo', 'gpt-3.5-turbo-16k'],
      requestsPerDay: 100,
      monthlyBudget: 5,
    },
    pro: {
      allocatedModels: ['gpt-3.5-turbo', 'gpt-4-turbo', 'claude-3-sonnet'],
      requestsPerDay: 10000,
      monthlyBudget: 500,
    },
    enterprise: {
      allocatedModels: ['gpt-3.5-turbo', 'gpt-4', 'gpt-4-turbo', 'claude-3-opus'],
      requestsPerDay: 1000000,
      monthlyBudget: -1, // -1 = unlimited
    },
  };

  /** Register (or overwrite) a model tier definition. */
  registerModelTier(tier: ModelTier): void {
    this.modelTiers.set(tier.name, tier);
  }

  /** Assign (or overwrite) a user's tier record. */
  setUserTier(userTier: UserTier): void {
    this.userTiers.set(userTier.userId, userTier);
  }

  /** Whether the user may call the given model, with a reason when not. */
  canUseModel(userId: string, model: string): { allowed: boolean; reason?: string } {
    const assignment = this.userTiers.get(userId);

    if (assignment === undefined) {
      return { allowed: false, reason: 'User tier not configured' };
    }

    return assignment.allocatedModels.includes(model)
      ? { allowed: true }
      : {
          allowed: false,
          reason: `Model ${model} not available for ${assignment.tier} users`,
        };
  }

  /**
   * Return the preferred model when allowed; otherwise fall back to the
   * user's first allocated model.
   * @throws Error when the user has no models at all.
   */
  routeToAllowedModel(userId: string, preferredModel: string): string {
    if (this.canUseModel(userId, preferredModel).allowed) {
      return preferredModel;
    }

    const fallback = this.userTiers.get(userId)?.allocatedModels[0];
    if (fallback === undefined) {
      throw new Error('No models available for user');
    }
    return fallback;
  }

  /** Models allocated to the user, or an empty list when unknown. */
  getAllocatedModelsForUser(userId: string): string[] {
    return this.userTiers.get(userId)?.allocatedModels ?? [];
  }

  /**
   * Move a user to a new tier and apply that tier's model/limit presets.
   * @throws Error when the user has no existing tier record.
   */
  updateUserTierWithModels(userId: string, newTier: 'free' | 'pro' | 'enterprise'): void {
    const user = this.userTiers.get(userId);

    if (!user) {
      throw new Error(`User ${userId} not found`);
    }

    const preset = ModelTierQuotaManager.TIER_PRESETS[newTier];
    user.tier = newTier;
    // Copy the array so tier presets are never shared between users.
    user.allocatedModels = [...preset.allocatedModels];
    user.requestsPerDay = preset.requestsPerDay;
    user.monthlyBudget = preset.monthlyBudget;
  }
}

const tierManager = new ModelTierQuotaManager();

tierManager.registerModelTier({
  name: 'free',
  models: ['gpt-3.5-turbo'],
  maxRequestsPerDay: 100,
  costPerRequest: 0.01,
});

tierManager.registerModelTier({
  name: 'pro',
  models: ['gpt-4-turbo', 'gpt-3.5-turbo'],
  maxRequestsPerDay: 10000,
  costPerRequest: 0.05,
});

tierManager.setUserTier({
  userId: 'user123',
  tier: 'free',
  allocatedModels: ['gpt-3.5-turbo'],
  requestsPerDay: 100,
  monthlyBudget: 5,
});

const canUse = tierManager.canUseModel('user123', 'gpt-4-turbo');
console.log('Can use GPT-4?', canUse.allowed, '-', canUse.reason);

const routed = tierManager.routeToAllowedModel('user123', 'gpt-4-turbo');
console.log('Routing to:', routed);

Request Queuing With Priority

Queue requests by priority tier so that paying users are served first.

// A single LLM request waiting in (or moving through) the priority queue.
interface QueuedRequest {
  id: string;               // unique id assigned by the queue at enqueue time
  userId: string;
  priority: number; // Higher = more important
  model: string;            // target model id
  estimatedTokens: number;  // caller-supplied estimate (informational only)
  addedAt: Date;            // enqueue time; used for wait-time stats
  processedAt?: Date;       // set when the request completes successfully
  status: 'queued' | 'processing' | 'completed' | 'failed';
}

class PriorityRequestQueue {
  /** Pending requests, kept sorted by descending priority (FIFO within a priority). */
  private pending: QueuedRequest[] = [];
  /** Requests currently being processed, keyed by request id. */
  private inFlight = new Map<string, QueuedRequest>();
  /** Cap on simultaneously processing requests. */
  private maxConcurrent = 10;

  /**
   * Enqueue a request, prioritized by the user's tier.
   * @returns the generated request id.
   */
  addRequest(
    userId: string,
    model: string,
    estimatedTokens: number,
    userTier: 'free' | 'pro' | 'enterprise'
  ): string {
    const tierPriority: Record<string, number> = {
      free: 1,
      pro: 5,
      enterprise: 10,
    };

    const entry: QueuedRequest = {
      id: `req_${Date.now()}_${Math.random()}`,
      userId,
      priority: tierPriority[userTier],
      model,
      estimatedTokens,
      addedAt: new Date(),
      status: 'queued',
    };

    // Insert before the first lower-priority entry so equal priorities
    // stay FIFO — same ordering the original stable push+sort produced.
    const insertAt = this.pending.findIndex((queued) => queued.priority < entry.priority);
    if (insertAt === -1) {
      this.pending.push(entry);
    } else {
      this.pending.splice(insertAt, 0, entry);
    }

    return entry.id;
  }

  /**
   * Dequeue the highest-priority request, or undefined when the queue is
   * empty or the concurrency cap has been reached.
   */
  getNextRequest(): QueuedRequest | undefined {
    if (this.inFlight.size >= this.maxConcurrent) {
      return undefined;
    }

    const next = this.pending.shift();
    if (next !== undefined) {
      next.status = 'processing';
      this.inFlight.set(next.id, next);
    }

    return next;
  }

  /** Mark an in-flight request as completed and stamp its finish time. */
  completeRequest(requestId: string): void {
    this.finish(requestId, 'completed', true);
  }

  /** Mark an in-flight request as failed (no finish timestamp). */
  failRequest(requestId: string): void {
    this.finish(requestId, 'failed', false);
  }

  /** Shared teardown for completeRequest/failRequest. */
  private finish(requestId: string, status: 'completed' | 'failed', stampProcessedAt: boolean): void {
    const entry = this.inFlight.get(requestId);
    if (!entry) {
      return;
    }
    entry.status = status;
    if (stampProcessedAt) {
      entry.processedAt = new Date();
    }
    this.inFlight.delete(requestId);
  }

  /** Queue depth, in-flight count, and average wait of pending requests. */
  getQueueStats(): {
    queuedCount: number;
    processingCount: number;
    avgWaitTimeMs: number;
  } {
    const now = Date.now();
    let totalWait = 0;
    for (const entry of this.pending) {
      totalWait += now - entry.addedAt.getTime();
    }
    const avgWait = this.pending.length > 0 ? totalWait / this.pending.length : 0;

    return {
      queuedCount: this.pending.length,
      processingCount: this.inFlight.size,
      avgWaitTimeMs: Math.round(avgWait),
    };
  }
}

const queue = new PriorityRequestQueue();

// Enqueue one request per tier; the enterprise request should come out first.
const submissions: Array<[string, string, number, 'free' | 'pro' | 'enterprise']> = [
  ['free_user', 'gpt-3.5-turbo', 500, 'free'],
  ['pro_user', 'gpt-4', 1000, 'pro'],
  ['enterprise_user', 'gpt-4', 2000, 'enterprise'],
];
for (const [userId, model, tokens, tier] of submissions) {
  queue.addRequest(userId, model, tokens, tier);
}

const next = queue.getNextRequest();
console.log('Next request from:', next?.userId, '(priority:', next?.priority, ')');

const stats = queue.getQueueStats();
console.log('Queue stats:', stats);

Hard Cutoffs vs Soft Warnings

Implement graceful quota enforcement with warnings and hard limits.

// Result of a quota check: whether to proceed, plus user-facing context.
interface QuotaResponse {
  allowed: boolean;                              // false once the hard cutoff is reached
  warningLevel: 'none' | 'warning' | 'critical'; // escalation level for messaging
  remaining: number;                             // tokens left (never negative)
  message?: string;                              // user-facing notice, set at warning/critical
}

class QuotaEnforcer {
  /** Fraction of the limit at which users receive a soft warning. */
  private warningThreshold = 0.8; // Warn at 80%
  /** Fraction of the limit at which requests are blocked outright. */
  private hardCutoff = 0.98; // Block at 98%
  private softCutoffMessage = 'You are approaching your daily limit. Requests will be limited.';
  private hardCutoffMessage = 'Daily token limit reached. Please try again tomorrow or upgrade your plan.';

  /**
   * Evaluate usage against the configured thresholds.
   * @param used   tokens consumed so far
   * @param limit  daily token limit; non-positive limits always deny
   *               (previously `used / 0` produced NaN and every request
   *               slipped past both threshold comparisons)
   * @param strict when true, the warning threshold also blocks — the old
   *               code accepted this flag but silently ignored it
   */
  checkQuota(used: number, limit: number, strict: boolean = false): QuotaResponse {
    // Zero or negative limit means "no budget": deny immediately.
    if (limit <= 0) {
      return {
        allowed: false,
        warningLevel: 'critical',
        remaining: 0,
        message: this.hardCutoffMessage,
      };
    }

    const utilization = used / limit;
    const remaining = Math.max(0, limit - used);

    if (utilization >= this.hardCutoff) {
      return {
        allowed: false,
        warningLevel: 'critical',
        remaining,
        message: this.hardCutoffMessage,
      };
    }

    if (utilization >= this.warningThreshold) {
      return {
        allowed: !strict, // strict mode treats the warning threshold as a cutoff
        warningLevel: 'warning',
        remaining,
        message: this.softCutoffMessage,
      };
    }

    return {
      allowed: true,
      warningLevel: 'none',
      remaining,
    };
  }

  /**
   * Reconfigure thresholds, given as percentages (e.g. 80 and 98).
   * @throws Error when the values are out of range or the warning threshold
   *         is not strictly below the hard cutoff (the old code silently
   *         accepted inverted thresholds).
   */
  configureThresholds(warningPercent: number, hardPercent: number): void {
    if (warningPercent <= 0 || hardPercent > 100 || warningPercent >= hardPercent) {
      throw new Error('Thresholds must satisfy 0 < warning < hard <= 100');
    }
    this.warningThreshold = warningPercent / 100;
    this.hardCutoff = hardPercent / 100;
  }

  /**
   * Map a quota response to a caller action. Behavior matches the original
   * branch chain: critical -> deny, warning -> suggest_upgrade, else
   * proceed. NOTE: 'warn' remains in the return union for API compatibility
   * but is never produced.
   */
  recommendedAction(response: QuotaResponse): 'proceed' | 'warn' | 'deny' | 'suggest_upgrade' {
    switch (response.warningLevel) {
      case 'critical':
        return 'deny';
      case 'warning':
        return 'suggest_upgrade';
      default:
        return 'proceed';
    }
  }
}

const enforcer = new QuotaEnforcer();

// Below the warning threshold: allowed with no message.
const midUsageCheck = enforcer.checkQuota(75000, 100000);
console.log('Check at 75%:', midUsageCheck);

// At the hard cutoff: blocked.
const nearLimitCheck = enforcer.checkQuota(98000, 100000);
console.log('Check at 98%:', nearLimitCheck);

console.log('Recommended action:', enforcer.recommendedAction(nearLimitCheck));

Cost Attribution Per Feature/User/Tenant

Track spending across dimensions for chargeback and optimization.

// One priced LLM call, tagged along every attribution dimension.
interface CostEvent {
  timestamp: Date;      // when the call was recorded
  featureId: string;    // product feature that triggered the call
  userId: string;
  tenantId: string;
  model: string;        // model id used for the pricing lookup
  inputTokens: number;
  outputTokens: number;
  cost: number;         // USD, computed from per-token model pricing
}

class CostAttributor {
  private events: CostEvent[] = [];
  // Per-token USD pricing (input/output). Models without an entry fall back
  // to zero cost so the event is still recorded for volume attribution.
  private modelPricing: Record<string, { input: number; output: number }> = {
    'gpt-4': { input: 0.00003, output: 0.00006 },
    'gpt-3.5-turbo': { input: 0.0000005, output: 0.0000015 },
    'claude-3-opus': { input: 0.000015, output: 0.000075 },
  };

  /**
   * Price and record one LLM call along all attribution dimensions.
   * Unknown models are recorded at zero cost (see modelPricing note).
   */
  recordCost(
    featureId: string,
    userId: string,
    tenantId: string,
    model: string,
    inputTokens: number,
    outputTokens: number
  ): void {
    const pricing = this.modelPricing[model] || { input: 0, output: 0 };
    const cost = inputTokens * pricing.input + outputTokens * pricing.output;

    this.events.push({
      timestamp: new Date(),
      featureId,
      userId,
      tenantId,
      model,
      inputTokens,
      outputTokens,
      cost,
    });
  }

  /**
   * Sum event costs within the timeframe, grouped by the given key.
   * Shared by the three public getCostsBy* views, which previously
   * duplicated this loop verbatim.
   */
  private aggregateBy(keyOf: (event: CostEvent) => string, timeframeMs: number): Record<string, number> {
    const cutoff = Date.now() - timeframeMs;
    const totals: Record<string, number> = {};

    for (const event of this.events) {
      if (event.timestamp.getTime() >= cutoff) {
        const key = keyOf(event);
        totals[key] = (totals[key] || 0) + event.cost;
      }
    }

    return totals;
  }

  /** Costs per feature over the timeframe (default: last 24h). */
  getCostsByFeature(timeframeMs: number = 86400000): Record<string, number> {
    return this.aggregateBy((event) => event.featureId, timeframeMs);
  }

  /** Costs per user over the timeframe (default: last 24h). */
  getCostsByUser(timeframeMs: number = 86400000): Record<string, number> {
    return this.aggregateBy((event) => event.userId, timeframeMs);
  }

  /** Costs per tenant over the timeframe (default: last 30 days). */
  getCostsByTenant(timeframeMs: number = 2592000000): Record<string, number> {
    return this.aggregateBy((event) => event.tenantId, timeframeMs);
  }

  /**
   * Full cost breakdown for one tenant over the last `days` days:
   * total spend plus per-feature, per-model, and per-user sums.
   */
  getDetailedAnalysis(tenantId: string, days: number = 30): {
    totalCost: number;
    byFeature: Record<string, number>;
    byModel: Record<string, number>;
    byUser: Record<string, number>;
  } {
    const cutoff = Date.now() - days * 86400000;
    const relevant = this.events.filter(
      (e) => e.tenantId === tenantId && e.timestamp.getTime() >= cutoff
    );

    const byFeature: Record<string, number> = {};
    const byModel: Record<string, number> = {};
    const byUser: Record<string, number> = {};
    let totalCost = 0;

    for (const event of relevant) {
      totalCost += event.cost;
      byFeature[event.featureId] = (byFeature[event.featureId] || 0) + event.cost;
      byModel[event.model] = (byModel[event.model] || 0) + event.cost;
      byUser[event.userId] = (byUser[event.userId] || 0) + event.cost;
    }

    return { totalCost, byFeature, byModel, byUser };
  }
}

const attributor = new CostAttributor();

// Record a couple of sample calls for one tenant.
const sampleCalls = [
  ['search', 'user1', 'tenant1', 'gpt-4', 500, 300],
  ['chat', 'user2', 'tenant1', 'gpt-3.5-turbo', 100, 200],
] as const;
for (const [feature, user, tenant, model, inTokens, outTokens] of sampleCalls) {
  attributor.recordCost(feature, user, tenant, model, inTokens, outTokens);
}

const costByFeature = attributor.getCostsByFeature();
console.log('Costs by feature:', costByFeature);

const analysis = attributor.getDetailedAnalysis('tenant1');
console.log('Detailed analysis:', analysis);

Real-Time Spend Dashboard

Track spending in real-time with cost breakdowns.

// Point-in-time view of spend produced by SpendDashboard.getSnapshot().
interface SpendSnapshot {
  timestamp: Date;          // when the snapshot was taken
  totalSpend: number;       // USD spent since local midnight
  spendRate: number; // dollars per minute
  projectedDaily: number;   // extrapolated end-of-day spend (USD)
  projectedMonthly: number; // extrapolated end-of-month spend (USD)
  budgetUtilization: number; // 0-100%
  alerts: string[];         // budget alerts active at snapshot time
}

class SpendDashboard {
  private dailyBudget = 1000;
  private monthlyBudget = 20000;
  /** Individual spend events (USD); pruning old events is left to the caller. */
  private spendEvents: Array<{ timestamp: Date; amount: number }> = [];
  /** Alerts computed by the most recent getSnapshot() call. */
  private alerts: string[] = [];

  /** Record a spend event (USD) at the current time. */
  recordSpend(amount: number): void {
    this.spendEvents.push({ timestamp: new Date(), amount });
  }

  /**
   * Compute a point-in-time view of spend, rate, projections, and alerts.
   * spendRate is reported in dollars per minute, matching the
   * SpendSnapshot contract (the old code returned dollars per hour here,
   * contradicting the interface's "dollars per minute" comment).
   */
  getSnapshot(): SpendSnapshot {
    const now = new Date();
    const todayStart = new Date(now.getFullYear(), now.getMonth(), now.getDate());
    const monthStart = new Date(now.getFullYear(), now.getMonth(), 1);

    // Calculate daily spend
    const todaySpends = this.spendEvents.filter((e) => e.timestamp >= todayStart);
    const todayTotal = todaySpends.reduce((sum, e) => sum + e.amount, 0);

    // Floor elapsed time at one minute so a snapshot taken right after
    // midnight doesn't divide by ~zero and produce Infinity projections.
    const hoursElapsedToday = Math.max(
      (Date.now() - todayStart.getTime()) / (1000 * 60 * 60),
      1 / 60
    );
    const spendPerMinute = todayTotal / (hoursElapsedToday * 60);

    // Project spending forward at the observed per-minute rate.
    const remainingHoursInDay = Math.max(0, 24 - hoursElapsedToday);
    const projectedDaily = todayTotal + spendPerMinute * 60 * remainingHoursInDay;

    const daysInMonth = new Date(now.getFullYear(), now.getMonth() + 1, 0).getDate();
    const remainingDaysInMonth = daysInMonth - now.getDate();
    const monthToDate = this.spendEvents
      .filter((e) => e.timestamp >= monthStart)
      .reduce((sum, e) => sum + e.amount, 0);
    const projectedMonthly = monthToDate + spendPerMinute * 60 * 24 * remainingDaysInMonth;

    // Check alerts
    const newAlerts: string[] = [];
    if (projectedDaily > this.dailyBudget * 0.8) {
      newAlerts.push('Daily spend projected to exceed 80% of budget');
    }
    if (todayTotal > this.dailyBudget) {
      newAlerts.push('Daily budget exceeded!');
    }
    if (projectedMonthly > this.monthlyBudget) {
      newAlerts.push('Monthly spend projected to exceed budget');
    }

    this.alerts = newAlerts;

    return {
      timestamp: new Date(),
      totalSpend: todayTotal,
      spendRate: spendPerMinute, // $/min, per the SpendSnapshot contract
      projectedDaily,
      projectedMonthly,
      // Guard a zero daily budget so we report 100% instead of Infinity.
      budgetUtilization: this.dailyBudget > 0 ? (todayTotal / this.dailyBudget) * 100 : 100,
      alerts: newAlerts,
    };
  }

  /** Set daily and monthly budgets (USD). */
  setBudgets(daily: number, monthly: number): void {
    this.dailyBudget = daily;
    this.monthlyBudget = monthly;
  }

  /** Alerts from the most recent snapshot (empty before the first one). */
  getAlerts(): string[] {
    return this.alerts;
  }
}

const dashboard = new SpendDashboard();
dashboard.setBudgets(1000, 20000);

// Record a few spend events.
for (const amount of [50, 75, 25]) {
  dashboard.recordSpend(amount);
}

const snapshot = dashboard.getSnapshot();
console.log('Spend snapshot:', snapshot);

Anomaly Detection for Unusual Token Spikes

Detect and alert on unexpected usage patterns.

class AnomalyDetector {
  /** Raw usage samples; pruning old samples is left to the caller. */
  private history: Array<{ timestamp: Date; tokens: number }> = [];
  /** Sliding window used as the baseline, in hours. */
  private windowSize = 24; // hours
  /** Z-score above which a sample is flagged. */
  private stdDevThreshold = 2.5; // Alert if > 2.5 std devs

  /** Record one usage sample at the current time. */
  recordTokenUsage(tokens: number): void {
    this.history.push({ timestamp: new Date(), tokens });
  }

  /**
   * Compare the most recent sample against the in-window baseline.
   *
   * Fixes vs the original: (1) the latest sample is now EXCLUDED from the
   * baseline, so a large spike can no longer inflate the mean/std-dev it
   * is being tested against; (2) a constant (zero-variance) baseline no
   * longer divides by zero into NaN/Infinity z-scores.
   */
  detectAnomaly(): { isAnomaly: boolean; severity: 'low' | 'medium' | 'high'; reason?: string } {
    if (this.history.length < 10) {
      return { isAnomaly: false, severity: 'low' };
    }

    const windowStart = Date.now() - this.windowSize * 3600000;
    const recent = this.history
      .filter((h) => h.timestamp.getTime() >= windowStart)
      .map((h) => h.tokens);

    // Need the latest sample plus at least two baseline points for a
    // meaningful standard deviation.
    if (recent.length < 3) {
      return { isAnomaly: false, severity: 'low' };
    }

    const latest = recent[recent.length - 1];
    const baseline = recent.slice(0, -1);
    const { mean, stdDev } = AnomalyDetector.stats(baseline);

    if (stdDev === 0) {
      // Flat baseline: any deviation at all is a clear anomaly.
      if (latest === mean) {
        return { isAnomaly: false, severity: 'low' };
      }
      return {
        isAnomaly: true,
        severity: 'high',
        reason: `Token spike: ${latest} tokens (baseline is constant at ${mean.toFixed(0)})`,
      };
    }

    const zScore = Math.abs((latest - mean) / stdDev);

    if (zScore > this.stdDevThreshold) {
      const severity = zScore > 4 ? 'high' : zScore > 3 ? 'medium' : 'low';
      return {
        isAnomaly: true,
        severity,
        reason: `Token spike: ${latest} tokens (${zScore.toFixed(1)} std devs above mean of ${mean.toFixed(0)})`,
      };
    }

    return { isAnomaly: false, severity: 'low' };
  }

  /** Mean and population std-dev over the full recorded history. */
  getBaseline(): { mean: number; stdDev: number } {
    if (this.history.length === 0) {
      return { mean: 0, stdDev: 0 };
    }
    return AnomalyDetector.stats(this.history.map((h) => h.tokens));
  }

  /** Population mean/std-dev of a non-empty sample. */
  private static stats(values: number[]): { mean: number; stdDev: number } {
    const mean = values.reduce((a, b) => a + b, 0) / values.length;
    const variance = values.reduce((sum, val) => sum + (val - mean) ** 2, 0) / values.length;
    return { mean, stdDev: Math.sqrt(variance) };
  }
}

const detector = new AnomalyDetector();

// Build a baseline of at least 10 samples first — detectAnomaly() refuses
// to judge with fewer, so the original 4-sample example could never fire.
for (const tokens of [1000, 1100, 950, 1050, 990, 1020, 1080, 960, 1010, 1040]) {
  detector.recordTokenUsage(tokens);
}
detector.recordTokenUsage(50000); // Anomaly!

const anomaly = detector.detectAnomaly();
console.log('Anomaly detected:', anomaly.isAnomaly, '-', anomaly.reason);

const baseline = detector.getBaseline();
console.log('Baseline:', baseline);

Graceful Degradation Messaging

Provide friendly messages when limits are reached.

class GracefulDegradationManager {
  // User-facing templates; {{key}} placeholders are filled by getMessage().
  private messages: Record<string, string> = {
    quota_exceeded: 'You have reached your daily token limit. Please try again tomorrow.',
    quota_warning: 'You are using {{percent}}% of your daily quota. Consider upgrading for more access.',
    model_unavailable: 'This model is not available for {{tier}} users. Try {{fallback_model}} instead.',
    queue_full: 'The system is at capacity. Your request is queued and will be processed shortly.',
    rate_limited: 'Please slow down. You are making requests too quickly. Try again in {{wait_time}}s.',
  };

  /**
   * Resolve a message template, substituting EVERY {{key}} occurrence.
   * Unknown message types fall back to a generic error string.
   */
  getMessage(
    messageType: string,
    variables?: Record<string, unknown>
  ): string {
    let message = this.messages[messageType] || 'An error occurred. Please try again.';

    if (variables) {
      for (const [key, value] of Object.entries(variables)) {
        // split/join replaces all occurrences; String.replace with a string
        // pattern (the old code) only replaced the first one.
        message = message.split(`{{${key}}}`).join(String(value));
      }
    }

    return message;
  }

  /**
   * Build an upgrade call-to-action.
   * @param currentTier the user's current tier (kept for signature
   *        compatibility; not used in the copy yet)
   * @param nextTier    the tier being suggested
   * @param benefits    benefit copy; the old version leaked the literal
   *        '{{benefits}}' placeholder into the user-facing string
   */
  getUpgradePrompt(
    currentTier: string,
    nextTier: string,
    benefits: string = 'higher limits and priority access'
  ): string {
    return `Upgrade to ${nextTier} for ${benefits}.`;
  }

  /**
   * Map a degradation reason to an HTTP-style response payload.
   * The reason determines the status code; the statusCode argument is
   * retained for backward compatibility but not consulted for the known
   * reasons.
   */
  getHTTPResponse(statusCode: number, reason: 'quota' | 'rate_limit' | 'model_unavailable'): {
    statusCode: number;
    message: string;
    retryAfter?: number;
  } {
    switch (reason) {
      case 'quota':
        return {
          statusCode: 429,
          message: this.getMessage('quota_exceeded'),
          retryAfter: 86400, // 24 hours
        };
      case 'rate_limit':
        return {
          statusCode: 429,
          message: this.getMessage('rate_limited', { wait_time: 60 }),
          retryAfter: 60,
        };
      case 'model_unavailable':
        return {
          statusCode: 400,
          message: this.getMessage('model_unavailable', { tier: 'free', fallback_model: 'gpt-3.5-turbo' }),
        };
      default:
        // Unreachable under the declared union; kept as a guard for
        // untyped JavaScript callers.
        return {
          statusCode: 500,
          message: 'Internal server error',
        };
    }
  }
}

const degradation = new GracefulDegradationManager();

// Fill the {{percent}} placeholder in the warning template.
console.log(degradation.getMessage('quota_warning', { percent: 85 }));

const response = degradation.getHTTPResponse(429, 'quota');
console.log('HTTP Response:', response);

Checklist

  • Track daily and monthly token budgets per user in Redis
  • Allocate expensive models (GPT-4) only to paying tiers
  • Implement priority queuing with free/pro/enterprise weights
  • Set warning threshold at 80% utilization, hard cutoff at 98%
  • Attribute costs per feature, user, and tenant for chargeback
  • Build real-time spend dashboard with projections
  • Alert on projected spend exceeding daily/monthly budget
  • Detect usage anomalies using statistical baselines
  • Provide friendly degradation messages, not cryptic errors
  • Enable users to upgrade on the fly when hitting limits
  • Log all quota events for audit trails
  • Review top-cost features weekly for optimization

Conclusion

Quota systems protect revenue and user experience simultaneously. Start with simple per-user token budgets in Redis, add tiered model access based on plan, then implement priority queuing, cost attribution, and anomaly detection as you scale. This layered approach prevents bill shock while ensuring paying customers get priority access.