Published on

LLM Observability in Production — Tracing, Evaluating, and Debugging AI Features

Authors

Introduction

LLM outputs are probabilistic. Without observability, you cannot detect quality regressions, identify failure patterns, or optimize performance. This guide covers production observability strategies used by scaled AI companies.

LangSmith and Langfuse Instrumentation

Set up centralized tracing for all LLM calls with detailed span information.

import { LangSmith } from 'langsmith';
import { Langfuse } from 'langfuse';

class ObservableLLMClient {
  // NOTE(review): the LangSmith client is constructed but never used in the
  // visible code; presumably other tracing paths (not shown) go through it.
  private langsmith: LangSmith;
  private langfuse: Langfuse;

  constructor(
    langsmithKey: string,
    langfuseKey: string,
    langfusePublicKey: string
  ) {
    this.langsmith = new LangSmith({ apiKey: langsmithKey });
    this.langfuse = new Langfuse({ secretKey: langfuseKey, publicKey: langfusePublicKey });
  }

  /**
   * Calls the OpenAI chat-completions API and records the call as a Langfuse
   * trace containing a single `api_call` span with token/latency metadata.
   *
   * @param model    OpenAI model identifier (e.g. 'gpt-4-turbo').
   * @param messages Chat history to send, in OpenAI message format.
   * @param userId   End-user id attached to the trace for per-user analysis.
   * @param feature  Product feature name; becomes part of the trace name.
   * @returns The assistant's reply text.
   * @throws Error when the request fails, the API returns a non-2xx status,
   *         or the response contains no completion.
   */
  async traceCompletion(
    model: string,
    messages: Array<{ role: string; content: string }>,
    userId: string,
    feature: string
  ): Promise<string> {
    const trace = this.langfuse.trace({
      name: `llm_completion_${feature}`,
      input: { model, messageCount: messages.length },
      userId,
    });

    const generation = trace.span({
      name: 'api_call',
      input: { messages },
      metadata: { model, feature, userId },
    });

    try {
      const startTime = Date.now();

      const response = await fetch('https://api.openai.com/v1/chat/completions', {
        method: 'POST',
        headers: {
          'Content-Type': 'application/json',
          Authorization: `Bearer ${process.env.OPENAI_API_KEY}`,
        },
        body: JSON.stringify({
          model,
          messages,
          temperature: 0.7,
        }),
      });

      // Fail fast on HTTP errors; otherwise a 4xx/5xx error body without
      // `choices` would surface below as an opaque TypeError instead of the
      // actual API failure.
      if (!response.ok) {
        throw new Error(`OpenAI API error: ${response.status} ${response.statusText}`);
      }

      const data = (await response.json()) as {
        choices: Array<{ message: { content: string } }>;
        usage: { prompt_tokens: number; completion_tokens: number };
      };

      const latencyMs = Date.now() - startTime;
      const content = data.choices[0]?.message?.content;
      if (content === undefined) {
        throw new Error('OpenAI API returned no completion choices');
      }

      generation.end({
        output: content,
        metadata: {
          promptTokens: data.usage.prompt_tokens,
          completionTokens: data.usage.completion_tokens,
          latencyMs,
        },
      });

      trace.update({
        output: { content, success: true },
        metadata: { totalLatencyMs: latencyMs },
      });

      return content;
    } catch (error) {
      const message = error instanceof Error ? error.message : String(error);

      generation.end({
        output: null,
        level: 'ERROR',
        statusMessage: message,
      });

      // Also mark the whole trace as failed, not just the inner span, so
      // trace-level dashboards reflect the error.
      trace.update({
        output: { success: false },
        metadata: { error: message },
      });

      throw error;
    }
  }
}

// Demo: build the instrumented client from environment configuration and
// trace one summarization call end-to-end.
const client = new ObservableLLMClient(
  process.env.LANGSMITH_API_KEY!,
  process.env.LANGFUSE_SECRET!,
  process.env.LANGFUSE_PUBLIC!
);

const demoMessages = [{ role: 'user', content: 'Summarize this document...' }];

const result = await client.traceCompletion(
  'gpt-4-turbo',
  demoMessages,
  'user123',
  'document_summarization'
);

Span Instrumentation for Chains

Break down multi-step AI workflows into measurable spans.

class ChainTracer {
  private spans: Map<string, { startTime: number; data: Record<string, unknown> }> = new Map();

  startSpan(spanId: string, name: string, metadata: Record<string, unknown> = {}): void {
    this.spans.set(spanId, {
      startTime: Date.now(),
      data: { name, metadata, events: [] },
    });
  }

  addEvent(spanId: string, eventName: string, data: Record<string, unknown> = {}): void {
    const span = this.spans.get(spanId);
    if (!span) return;

    (span.data.events as Array<{ name: string; data: Record<string, unknown>; timestamp: number }> = span.data.events || []).push({
      name: eventName,
      data,
      timestamp: Date.now(),
    });
  }

  endSpan(spanId: string, status: 'success' | 'error' = 'success', output: unknown = null): Record<string, unknown> {
    const span = this.spans.get(spanId);
    if (!span) return {};

    const duration = Date.now() - span.startTime;
    const record = {
      ...span.data,
      duration,
      status,
      output,
      timestamp: new Date().toISOString(),
    };

    this.spans.delete(spanId);
    console.log(`[SPAN] ${record.name} completed in ${duration}ms`);

    return record;
  }
}

/**
 * Demo: traces a four-stage RAG-style chain (preprocess → embed → retrieve →
 * generate) with one span per stage. The embedding and LLM calls are
 * simulated.
 */
async function tracedAIChain(input: string): Promise<void> {
  const tracer = new ChainTracer();
  const chainId = `chain_${Date.now()}`;
  const stage = (step: string): string => `${chainId}_${step}`;

  // 1. Preprocessing
  tracer.startSpan(stage('preprocess'), 'Preprocessing', { inputLength: input.length });
  const cleaned = input.toLowerCase().trim();
  tracer.addEvent(stage('preprocess'), 'cleaned_input', { length: cleaned.length });
  tracer.endSpan(stage('preprocess'), 'success', cleaned);

  // 2. Embedding (simulated API latency)
  tracer.startSpan(stage('embed'), 'Embedding', { text: cleaned });
  await new Promise((resolve) => setTimeout(resolve, 100));
  const vector = Array.from({ length: 384 }, () => Math.random());
  tracer.endSpan(stage('embed'), 'success', { dimensions: vector.length });

  // 3. Retrieval (simulated vector search)
  tracer.startSpan(stage('retrieval'), 'Vector Search', { embeddingDims: vector.length });
  const hits = [{ id: 'doc1', score: 0.95 }];
  tracer.endSpan(stage('retrieval'), 'success', { matchCount: hits.length });

  // 4. Generation (simulated LLM call)
  tracer.startSpan(stage('llm'), 'LLM Generation', { contextDocs: hits.length });
  const answer = 'Generated response based on retrieved context...';
  tracer.endSpan(stage('llm'), 'success', answer);
}

await tracedAIChain('Sample input for processing');

LLM-as-Judge Evaluation

Use an LLM to evaluate the quality of another LLM's outputs against criteria.

/** Outcome of a single LLM-as-judge evaluation. */
interface EvaluationResult {
  // Judge-assigned quality score on a 0-10 scale.
  score: number;
  // The judge model's free-text justification for the score.
  reasoning: string;
  // True when score >= 7 (the pass threshold used by LLMJudge.evaluate).
  passed: boolean;
}

class LLMJudge {
  /**
   * Scores `output` against `criteria` using GPT-4 as a judge.
   *
   * @param output   The candidate text to evaluate.
   * @param criteria Natural-language evaluation criteria.
   * @param examples Optional few-shot scored examples to calibrate the judge.
   * @returns Score (0-10), the judge's reasoning, and passed (score >= 7).
   * @throws Error on HTTP failure, or when the judge's reply is not valid
   *         JSON with a numeric score.
   */
  async evaluate(
    output: string,
    criteria: string,
    examples?: Array<{ output: string; score: number }>
  ): Promise<EvaluationResult> {
    const examplePrompt = examples
      ? `Examples:\n${examples.map((ex) => `Output: ${ex.output}\nScore: ${ex.score}/10`).join('\n\n')}\n\n`
      : '';

    const prompt = `${examplePrompt}Evaluate this output based on the criteria: ${criteria}\n\nOutput: "${output}"\n\nRespond with JSON: { "score": <0-10>, "reasoning": "<explanation>" }`;

    // temperature 0 keeps judge scores as deterministic as the API allows.
    const response = await fetch('https://api.openai.com/v1/chat/completions', {
      method: 'POST',
      headers: {
        'Content-Type': 'application/json',
        Authorization: `Bearer ${process.env.OPENAI_API_KEY}`,
      },
      body: JSON.stringify({
        model: 'gpt-4-turbo',
        messages: [{ role: 'user', content: prompt }],
        temperature: 0,
      }),
    });

    // Surface HTTP failures directly instead of failing later on a missing
    // `choices` array.
    if (!response.ok) {
      throw new Error(`Judge API error: ${response.status} ${response.statusText}`);
    }

    const data = (await response.json()) as { choices: Array<{ message: { content: string } }> };
    const raw = data.choices[0].message.content;

    // Models often wrap JSON replies in markdown code fences; strip them
    // before parsing.
    const jsonText = raw.replace(/^```(?:json)?\s*/i, '').replace(/\s*```$/, '').trim();

    let parsed: { score?: unknown; reasoning?: unknown };
    try {
      parsed = JSON.parse(jsonText) as { score?: unknown; reasoning?: unknown };
    } catch {
      throw new Error(`Judge returned non-JSON output: ${raw}`);
    }

    // Validate the parsed payload instead of trusting the model's output.
    const score = typeof parsed.score === 'number' ? parsed.score : Number(parsed.score);
    if (!Number.isFinite(score)) {
      throw new Error(`Judge returned a non-numeric score: ${raw}`);
    }

    return {
      score,
      reasoning: typeof parsed.reasoning === 'string' ? parsed.reasoning : String(parsed.reasoning ?? ''),
      passed: score >= 7,
    };
  }

  /** Evaluates many outputs against the same criteria concurrently. */
  async evaluateBatch(
    outputs: string[],
    criteria: string
  ): Promise<Array<{ output: string; evaluation: EvaluationResult }>> {
    const results = await Promise.all(outputs.map((out) => this.evaluate(out, criteria)));
    return outputs.map((output, i) => ({ output, evaluation: results[i] }));
  }
}

// Demo: score a single answer for factual accuracy.
const judge = new LLMJudge();

// Named `judgeEvaluation` (was `evaluation`) so it cannot collide with the
// golden-dataset `evaluation` const declared later at module scope — two
// `const evaluation` declarations in one module are a redeclaration error.
const judgeEvaluation = await judge.evaluate(
  'The capital of France is Paris, located on the Seine River.',
  'Is the response factually accurate and relevant?'
);

console.log(`Score: ${judgeEvaluation.score}/10 - ${judgeEvaluation.reasoning}`);

Cosine Similarity and Exact Match Metrics

Evaluate semantic similarity and exact correctness.

/** Deterministic retrieval/output metrics for evaluating model responses. */
class EvaluationMetrics {
  /**
   * Cosine similarity between two equal-length vectors, in [-1, 1].
   * Returns 0 when either vector has zero magnitude (the similarity is
   * undefined there; previously this returned NaN).
   * @throws Error when the vectors differ in length.
   */
  cosineSimilarity(vecA: number[], vecB: number[]): number {
    if (vecA.length !== vecB.length) {
      throw new Error('Vectors must have same dimensions');
    }

    const dotProduct = vecA.reduce((sum, a, i) => sum + a * vecB[i], 0);
    const magnitudeA = Math.sqrt(vecA.reduce((sum, a) => sum + a * a, 0));
    const magnitudeB = Math.sqrt(vecB.reduce((sum, b) => sum + b * b, 0));

    // Avoid NaN from 0/0 when either vector is all zeros (or empty).
    if (magnitudeA === 0 || magnitudeB === 0) return 0;

    return dotProduct / (magnitudeA * magnitudeB);
  }

  /** Case-insensitive, whitespace-trimmed exact string comparison. */
  exactMatch(predicted: string, expected: string): boolean {
    return predicted.trim().toLowerCase() === expected.trim().toLowerCase();
  }

  /**
   * Fraction of shared words relative to the longer text, in [0, 1].
   * Empty/whitespace-only inputs score 0 (previously splitting '' yielded a
   * spurious '' token that could count as a match).
   */
  partialMatch(predicted: string, expected: string): number {
    // filter(Boolean) drops the empty tokens split() produces for empty or
    // padded strings.
    const predWords = predicted.toLowerCase().split(/\s+/).filter(Boolean);
    const expectedWords = expected.toLowerCase().split(/\s+/).filter(Boolean);

    const total = Math.max(predWords.length, expectedWords.length);
    if (total === 0) return 0;

    const matches = predWords.filter((word) => expectedWords.includes(word)).length;
    return matches / total;
  }

  /**
   * Recall@k: fraction of the top-k predictions present in the gold set,
   * normalized by min(k, |gold|). Returns 0 for an empty gold set
   * (previously NaN from 0/0).
   */
  recallAtK(predictions: string[], goldStandard: string[], k: number = 5): number {
    if (goldStandard.length === 0) return 0;
    const topK = predictions.slice(0, k);
    const matches = topK.filter((pred) => goldStandard.includes(pred)).length;
    return matches / Math.min(k, goldStandard.length);
  }

  /** Reciprocal rank of the first prediction in the gold set; 0 if none. */
  meanReciprocalRank(predictions: string[], goldStandard: string[]): number {
    for (let i = 0; i < predictions.length; i++) {
      if (goldStandard.includes(predictions[i])) {
        return 1 / (i + 1);
      }
    }
    return 0;
  }
}

// Demo: exercise each metric on small, hand-checkable inputs.
const metrics = new EvaluationMetrics();

const a = [1, 0, 1, 0];
const b = [1, 0, 0, 1];
console.log(`Cosine similarity: ${metrics.cosineSimilarity(a, b)}`);
console.log(`Exact match: ${metrics.exactMatch('Paris', 'paris')}`);
console.log(`Partial match: ${metrics.partialMatch('The capital is Paris', 'Paris city')}`);
console.log(`Recall@5: ${metrics.recallAtK(['a', 'b', 'c', 'd', 'e'], ['c', 'd', 'f'], 5)}`);
console.log(`MRR: ${metrics.meanReciprocalRank(['a', 'b', 'c'], ['c', 'd'])}`);

Golden Dataset Maintenance

Build and maintain a golden dataset for continuous evaluation.

/** A single human-reviewed input/output pair used for regression evals. */
interface GoldenExample {
  /** Unique id, generated when the example is added. */
  id: string;
  /** Prompt/input sent to the model. */
  input: string;
  /** The reviewed, expected model output. */
  expectedOutput: string;
  /** Free-form grouping label (e.g. 'math', 'geography'). */
  category: string;
  /** When the example was added to the dataset. */
  createdAt: Date;
  /** Who signed off on this example. */
  reviewedBy: string;
}

/** In-memory golden dataset with exact-match evaluation and JSON export. */
class GoldenDataset {
  private examples: GoldenExample[] = [];

  /** Adds a reviewed example; the id embeds a timestamp plus a random suffix. */
  addExample(input: string, expectedOutput: string, category: string, reviewer: string): void {
    this.examples.push({
      id: `golden_${Date.now()}_${Math.random()}`,
      input,
      expectedOutput,
      category,
      createdAt: new Date(),
      reviewedBy: reviewer,
    });
  }

  /** Returns all examples whose category matches exactly. */
  getByCategory(category: string): GoldenExample[] {
    return this.examples.filter((ex) => ex.category === category);
  }

  /**
   * Runs `model` over every example sequentially and scores each output with
   * a case-insensitive, whitespace-trimmed exact match.
   */
  async evaluateAgainstGolden(
    model: (input: string) => Promise<string>
  ): Promise<{ passed: number; failed: number; results: Array<{ example: GoldenExample; actual: string; match: boolean }> }> {
    // Explicitly typed so the array does not fall back to an evolving any[].
    const results: Array<{ example: GoldenExample; actual: string; match: boolean }> = [];
    let passed = 0;
    let failed = 0;

    for (const example of this.examples) {
      const actual = await model(example.input);
      const match = actual.trim().toLowerCase() === example.expectedOutput.trim().toLowerCase();

      results.push({ example, actual, match });
      if (match) passed++;
      else failed++;
    }

    return { passed, failed, results };
  }

  /** Serializes the dataset as pretty-printed JSON for backup/versioning. */
  exportForBackup(): string {
    return JSON.stringify(this.examples, null, 2);
  }
}

// Demo: seed the golden set and evaluate a deterministic mock model.
const goldenSet = new GoldenDataset();
goldenSet.addExample('What is 2+2?', '4', 'math', 'reviewer@example.com');
goldenSet.addExample('What is the capital of France?', 'Paris', 'geography', 'reviewer@example.com');

// Stand-in for a real model call; answers the two seeded questions.
const mockModel = async (input: string): Promise<string> => {
  if (input.includes('2+2')) return '4';
  if (input.includes('capital')) return 'Paris';
  return 'Unknown';
};

// Named `goldenEvaluation` (was `evaluation`) so it cannot collide with the
// LLM-judge `evaluation` const declared earlier at module scope — two
// `const evaluation` declarations in one module are a redeclaration error.
const goldenEvaluation = await goldenSet.evaluateAgainstGolden(mockModel);
console.log(`Passed: ${goldenEvaluation.passed}, Failed: ${goldenEvaluation.failed}`);

Latency Percentiles for AI Endpoints

Monitor response time distribution to catch performance degradation.

/** Collects latency samples and reports percentile/mean/stddev statistics. */
class LatencyMonitor {
  // Raw samples in arrival order; reset() clears them.
  private samples: number[] = [];

  /** Records one latency sample in milliseconds. */
  recordLatency(ms: number): void {
    this.samples.push(ms);
  }

  /**
   * Nearest-rank percentile: the sample at ceil(p% * n), clamped to a valid
   * index. Returns 0 when no samples have been recorded.
   */
  getPercentile(p: number): number {
    const n = this.samples.length;
    if (n === 0) return 0;

    const ordered = [...this.samples].sort((x, y) => x - y);
    const rank = Math.ceil((p / 100) * n) - 1;
    return ordered[Math.max(0, rank)];
  }

  /** Summary statistics over all recorded samples (zeros when empty). */
  getStats(): {
    p50: number;
    p95: number;
    p99: number;
    mean: number;
    stdDev: number;
  } {
    const n = this.samples.length;
    if (n === 0) {
      return { p50: 0, p95: 0, p99: 0, mean: 0, stdDev: 0 };
    }

    let total = 0;
    for (const v of this.samples) total += v;
    const mean = total / n;

    // Population standard deviation (divide by n, not n-1).
    let squaredDiffs = 0;
    for (const v of this.samples) squaredDiffs += (v - mean) ** 2;
    const stdDev = Math.sqrt(squaredDiffs / n);

    return {
      p50: this.getPercentile(50),
      p95: this.getPercentile(95),
      p99: this.getPercentile(99),
      mean,
      stdDev,
    };
  }

  /** Discards all recorded samples. */
  reset(): void {
    this.samples = [];
  }
}

// Demo: record ten sample latencies and report the percentile distribution.
const monitor = new LatencyMonitor();
const sampleLatencies = [120, 135, 145, 150, 180, 200, 220, 250, 300, 500];
for (const latency of sampleLatencies) {
  monitor.recordLatency(latency);
}

const stats = monitor.getStats();
console.log(`P50: ${stats.p50}ms, P95: ${stats.p95}ms, P99: ${stats.p99}ms`);

Token Usage Dashboards

Track token consumption by feature, model, and user.

/** One logged LLM call's token consumption. */
interface TokenUsageRecord {
  /** Product feature that made the call. */
  feature: string;
  /** Model identifier (e.g. 'gpt-4'). */
  model: string;
  /** End user on whose behalf the call ran. */
  userId: string;
  /** Prompt-side token count. */
  inputTokens: number;
  /** Completion-side token count. */
  outputTokens: number;
  /** When the usage was recorded. */
  timestamp: Date;
}

/** In-memory aggregation of token usage by feature, model, and user. */
class TokenDashboard {
  private records: TokenUsageRecord[] = [];

  /** Appends one usage record stamped with the current time. */
  recordUsage(
    feature: string,
    model: string,
    userId: string,
    inputTokens: number,
    outputTokens: number
  ): void {
    const entry: TokenUsageRecord = {
      feature,
      model,
      userId,
      inputTokens,
      outputTokens,
      timestamp: new Date(),
    };
    this.records.push(entry);
  }

  /**
   * Input/output token totals per feature within the trailing window
   * (default window: the last 24 hours).
   */
  getFeatureUsage(timeframeMs: number = 86400000): Record<string, { input: number; output: number }> {
    const cutoff = new Date(Date.now() - timeframeMs);
    const usage: Record<string, { input: number; output: number }> = {};

    for (const rec of this.records) {
      if (rec.timestamp < cutoff) continue;
      const bucket = usage[rec.feature] ?? (usage[rec.feature] = { input: 0, output: 0 });
      bucket.input += rec.inputTokens;
      bucket.output += rec.outputTokens;
    }

    return usage;
  }

  /** Total tokens and call count per model, over all recorded history. */
  getModelComparison(): Record<string, { totalTokens: number; usageCount: number }> {
    const comparison: Record<string, { totalTokens: number; usageCount: number }> = {};

    for (const rec of this.records) {
      const bucket = comparison[rec.model] ?? (comparison[rec.model] = { totalTokens: 0, usageCount: 0 });
      bucket.totalTokens += rec.inputTokens + rec.outputTokens;
      bucket.usageCount += 1;
    }

    return comparison;
  }

  /** Heaviest consumers by combined input+output tokens, descending. */
  getTopUsers(limit: number = 10): Array<{ userId: string; totalTokens: number }> {
    const totals = new Map<string, number>();

    for (const rec of this.records) {
      const combined = rec.inputTokens + rec.outputTokens;
      totals.set(rec.userId, (totals.get(rec.userId) ?? 0) + combined);
    }

    return [...totals.entries()]
      .map(([userId, totalTokens]) => ({ userId, totalTokens }))
      .sort((x, y) => y.totalTokens - x.totalTokens)
      .slice(0, limit);
  }
}

// Demo: record a couple of calls, then print the aggregate views.
const dashboard = new TokenDashboard();
const demoCalls: Array<[string, string, string, number, number]> = [
  ['summarize', 'gpt-4', 'user1', 150, 200],
  ['classify', 'gpt-3.5-turbo', 'user2', 50, 75],
];
for (const [feature, usedModel, userId, inTok, outTok] of demoCalls) {
  dashboard.recordUsage(feature, usedModel, userId, inTok, outTok);
}

console.log('Feature usage:', dashboard.getFeatureUsage());
console.log('Top users:', dashboard.getTopUsers());

Regression Detection and Model Migration Testing

Detect quality drops when swapping models.

/** Compares current model metrics against stored baselines to flag drops. */
class RegressionDetector {
  // Per-model baseline success rate. Only `successRate` from the metrics
  // passed to setBaseline is retained; other keys are ignored.
  private baselineMetrics: Record<string, number> = {};

  /** Stores the baseline success rate for a model (other metric keys are ignored). */
  setBaseline(model: string, metrics: Record<string, number>): void {
    this.baselineMetrics[model] = metrics.successRate || 0;
  }

  /**
   * Compares the current success rate against the stored baseline.
   *
   * @param threshold Relative drop that counts as a regression (default 5%).
   * @returns regressed flag, relative change in percent, and a message.
   *          A model with no stored (or zero) baseline is compared to 1.0.
   */
  detectRegression(model: string, currentMetrics: Record<string, number>, threshold: number = 0.05): {
    regressed: boolean;
    change: number;
    message: string;
  } {
    // `|| 1.0` also catches a stored baseline of 0, keeping the division safe.
    const baseline = this.baselineMetrics[model] || 1.0;
    const current = currentMetrics.successRate || 0;
    const change = (baseline - current) / baseline;

    return {
      regressed: change > threshold,
      change: change * 100,
      message: change > threshold ? `REGRESSION: ${(change * 100).toFixed(2)}% drop` : 'No regression detected',
    };
  }

  /**
   * Scores old and new models over a shared test suite and reports whether
   * the migration is "safe" (new pass rate within 5% of the old pass rate).
   *
   * NOTE(review): model outputs are simulated with string prefixes here;
   * swap in real inference calls for production use.
   */
  testModelMigration(
    oldModel: string,
    newModel: string,
    testCases: Array<{ input: string; expected: string }>,
    evaluator: (output: string, expected: string) => boolean
  ): { oldScore: number; newScore: number; safe: boolean } {
    // Guard: an empty suite previously produced NaN scores (0/0). Treat it
    // as trivially safe with zero scores.
    if (testCases.length === 0) {
      return { oldScore: 0, newScore: 0, safe: true };
    }

    let oldScore = 0;
    let newScore = 0;

    for (const testCase of testCases) {
      // Simulate old model performance
      const oldOutput = `old_${testCase.input}`;
      if (evaluator(oldOutput, testCase.expected)) oldScore++;

      // Simulate new model performance
      const newOutput = `new_${testCase.input}`;
      if (evaluator(newOutput, testCase.expected)) newScore++;
    }

    const oldRate = oldScore / testCases.length;
    const newRate = newScore / testCases.length;
    const safe = newRate >= oldRate * 0.95;

    return { oldScore: oldRate, newScore: newRate, safe };
  }
}

// Demo: a 0.95 → 0.89 success-rate drop is a ~6.3% relative decline, which
// exceeds the default 5% threshold and should print a regression message.
const detector = new RegressionDetector();
detector.setBaseline('gpt-4', { successRate: 0.95 });

const regression = detector.detectRegression('gpt-4', { successRate: 0.89 });
console.log(regression.message);

Checklist

  • Instrument all LLM calls with LangSmith or Langfuse
  • Break down chains into measurable spans with events
  • Use LLM-as-judge for subjective quality evaluation
  • Maintain golden datasets with reviewed examples
  • Monitor p50, p95, p99 latencies weekly
  • Create feature-level token usage dashboards
  • Set up regression detection alerts for model changes
  • Test model migrations against golden datasets before deployment
  • Build cost vs. quality tradeoff dashboards
  • Automate eval runs on every model or prompt change

Conclusion

LLM observability is the foundation of reliable AI systems. Start with basic tracing in LangSmith, add LLM-as-judge evaluations, and maintain golden datasets. As you scale, add latency monitoring, token dashboards, and automated regression detection. This layered approach catches quality regressions early and provides data-driven confidence for model migrations.