Published on

Fine-Tuning vs RAG — When to Train Your Model and When to Retrieve Instead

Authors

Introduction

Fine-tuning and RAG are both ways to customize LLMs, but they solve different problems. Fine-tuning adapts a model to your style/format; RAG retrieves live data. Choosing wrong wastes money and latency. This post covers decision frameworks, cost tradeoffs, hybrid approaches, evaluation metrics (RAGAS, G-Eval), and when each wins.

Decision Framework: RAG vs Fine-Tuning

Choose based on your data freshness, query types, and cost constraints.

enum RecommendedApproach {
  RAG = 'rag',
  FINE_TUNING = 'fine_tuning',
  HYBRID = 'hybrid',
}

/** Inputs to the RAG-vs-fine-tuning decision. */
interface DecisionCriteria {
  dataFreshness: 'static' | 'daily' | 'hourly' | 'realtime';
  queryTypes: Array<'factual_lookup' | 'style_adaptation' | 'format_specific' | 'domain_knowledge'>;
  dataSize: number; // Number of documents
  updateFrequency: 'never' | 'monthly' | 'weekly' | 'daily' | 'continuous';
  costConstraint: number; // Monthly budget in dollars
  latencyRequirement: number; // P95 latency in ms
  accuracyRequirement: number; // 0-1, how important is correctness (not yet consulted by the scorer)
}

class ApproachSelector {
  /**
   * Scores RAG and fine-tuning against the criteria and returns the
   * recommended approach, the reasoning trail, and cost/latency estimates.
   * Scores within 2 points of each other yield a HYBRID recommendation.
   */
  selectApproach(criteria: DecisionCriteria): {
    recommended: RecommendedApproach;
    reasoning: string[];
    estimatedMonthlyCost: number;
    estimatedLatencyMs: number;
  } {
    const reasoning: string[] = [];
    let ragScore = 0;
    let finetuneScore = 0;

    // Data freshness: RAG wins if data changes frequently
    if (['hourly', 'realtime'].includes(criteria.dataFreshness)) {
      ragScore += 3;
      reasoning.push('Data requires real-time freshness (RAG advantage)');
    } else if (criteria.dataFreshness === 'static') {
      finetuneScore += 2;
      reasoning.push('Data is static (fine-tuning viable)');
    }

    // Query types: Style/format → fine-tuning, factual → RAG
    const styleQueries = criteria.queryTypes.filter(q => ['style_adaptation', 'format_specific'].includes(q));
    const factualQueries = criteria.queryTypes.filter(q => ['factual_lookup', 'domain_knowledge'].includes(q));

    if (styleQueries.length > 0) {
      finetuneScore += styleQueries.length * 2;
      reasoning.push(`Style/format queries favor fine-tuning (${styleQueries.length} types)`);
    }

    if (factualQueries.length > 0) {
      ragScore += factualQueries.length;
      reasoning.push(`Factual queries favor RAG (${factualQueries.length} types)`);
    }

    // Data size: RAG scales better
    if (criteria.dataSize > 100000) {
      ragScore += 2;
      reasoning.push(`Large dataset (${criteria.dataSize} docs) favors RAG`);
    } else if (criteria.dataSize < 10000) {
      finetuneScore += 1;
      reasoning.push(`Small dataset (${criteria.dataSize} docs) is fine-tuning-friendly`);
    }

    // Update frequency: Frequent → RAG
    if (['daily', 'continuous'].includes(criteria.updateFrequency)) {
      ragScore += 2;
      reasoning.push('Frequent updates require RAG');
    } else if (criteria.updateFrequency === 'never') {
      finetuneScore += 2;
      reasoning.push('Static data enables fine-tuning');
    }

    // Cost: award a point to whichever approach fits the budget best
    const ragCost = this.estimateRAGCost(criteria);
    const finetuningCost = this.estimateFinetuningCost(criteria);

    if (ragCost < criteria.costConstraint && ragCost < finetuningCost) {
      ragScore += 1;
      reasoning.push(`RAG is cheaper ($${ragCost}/month vs $${finetuningCost}/month)`);
    } else if (finetuningCost < criteria.costConstraint) {
      finetuneScore += 1;
      reasoning.push(`Fine-tuning fits budget ($${finetuningCost}/month)`);
    }

    // Latency: Fine-tuning faster (single forward pass, no retrieval hop)
    if (criteria.latencyRequirement < 500) {
      finetuneScore += 1;
      reasoning.push('Strict latency requirement favors fine-tuning');
    }

    // Decision
    let recommended: RecommendedApproach;

    if (Math.abs(ragScore - finetuneScore) < 2) {
      recommended = RecommendedApproach.HYBRID;
      reasoning.push('Similar scores suggest hybrid approach');
    } else if (ragScore > finetuneScore) {
      recommended = RecommendedApproach.RAG;
    } else {
      recommended = RecommendedApproach.FINE_TUNING;
    }

    // FIX: the hybrid case previously fell through to the fine-tuning
    // estimates. A hybrid system runs BOTH pipelines, so its cost is the
    // sum of the two and its latency is dominated by the retrieval path.
    let estimatedMonthlyCost: number;
    let estimatedLatencyMs: number;

    if (recommended === RecommendedApproach.RAG) {
      estimatedMonthlyCost = ragCost;
      estimatedLatencyMs = 800;
    } else if (recommended === RecommendedApproach.FINE_TUNING) {
      estimatedMonthlyCost = finetuningCost;
      estimatedLatencyMs = 100;
    } else {
      estimatedMonthlyCost = ragCost + finetuningCost;
      estimatedLatencyMs = 800;
    }

    return { recommended, reasoning, estimatedMonthlyCost, estimatedLatencyMs };
  }

  /** Monthly RAG cost: corpus embedding + vector DB hosting + per-query LLM calls. */
  private estimateRAGCost(criteria: DecisionCriteria): number {
    const embeddingCost = (criteria.dataSize * 500) / 1_000_000 * 0.02; // ~500 tokens per doc at $0.02/1M tokens
    const vectorDbCost = Math.max(10, criteria.dataSize / 1_000_000 * 40); // pgvector ~$40 per 1M vectors, $10 floor
    const llmCostPerQuery = 0.05; // GPT-3.5 turbo + context
    const queriesPerMonth = 100000; // Estimate
    const llmCost = llmCostPerQuery * queriesPerMonth;

    return embeddingCost + vectorDbCost + llmCost;
  }

  /** Monthly fine-tuning cost: amortized training + cheaper per-token inference. */
  private estimateFinetuningCost(criteria: DecisionCriteria): number {
    const trainingCost = 50; // One-time, amortized per month if doing monthly retrains
    const tokensPerMonth = 100000 * 500; // 100k queries * avg tokens
    const inferenceCost = tokensPerMonth / 1_000_000 * 0.001; // Cheaper fine-tuned model

    return trainingCost + inferenceCost;
  }
}

RAG Advantages for Live Data

RAG shines when your data changes frequently or you need citations.

/** One scenario where retrieval beats training, with a concrete example. */
interface RAGAdvantages {
  scenario: string;
  dataCharacteristic: string;
  advantage: string;
  example: string;
}

// Catalog of scenarios where RAG is the clear winner.
const ragWinScenarios: RAGAdvantages[] = [
  {
    scenario: 'News aggregation',
    dataCharacteristic: 'Daily/hourly updates',
    advantage: 'Can serve latest articles without retraining',
    example: 'Ask about "latest AI breakthroughs this week" → pulls fresh articles',
  },
  {
    scenario: 'Customer support',
    dataCharacteristic: 'FAQ/policies change weekly',
    advantage: 'Update knowledge base instantly without model retraining',
    example: 'New return policy → update knowledge base → users see it immediately',
  },
  {
    scenario: 'Code search',
    dataCharacteristic: 'Codebase changes daily',
    advantage: 'Search latest code without retraining on every commit',
    example: 'Search for "GraphQL resolver for user queries" → finds latest implementation',
  },
  {
    scenario: 'Research assistant',
    dataCharacteristic: 'Papers uploaded constantly',
    advantage: 'Cite exact sources and page numbers',
    example: 'Ask about "attention mechanisms" → retrieve and cite original papers',
  },
  {
    scenario: 'Legal doc analysis',
    dataCharacteristic: 'Contracts/regulations updated regularly',
    advantage: 'Always use current versions without retraining',
    example: '"What does our contract say about data retention?" → pulls exact clause',
  },
];

/**
 * True when the named scenario is one of the catalogued RAG wins.
 * Matching is case-insensitive and exact (no substring matching).
 */
function shouldUseRAG(scenario: string): boolean {
  const needle = scenario.toLowerCase();
  return ragWinScenarios.some(entry => entry.scenario.toLowerCase() === needle);
}

Fine-Tuning for Style/Format

Fine-tuning bakes behavior, tone, and output format into the model's weights, so the adapted style persists across requests without any retrieved context or prompt scaffolding.

/** One scenario where fine-tuning pays off, with its cost/benefit. */
interface FinetuningGainScenario {
  scenario: string;
  styleAdaptation: string;
  example: string;
  costBenefit: string;
}

// Catalog of scenarios where fine-tuning is the clear winner.
const finetuningWinScenarios: FinetuningGainScenario[] = [
  {
    scenario: 'Customer service bot',
    styleAdaptation: 'Company voice, friendly tone, specific formatting',
    example: 'Train on 1000 customer service conversations → outputs consistent brand voice',
    costBenefit: 'Single LLM call after training (no retrieval overhead)',
  },
  {
    scenario: 'Code generation',
    styleAdaptation: 'Specific language idioms, project conventions',
    example: 'Fine-tune on React + TypeScript repo → generates matching code style',
    costBenefit: 'Fewer hallucinations with domain-specific patterns',
  },
  {
    scenario: 'Technical writer',
    styleAdaptation: 'Specific terminology, doc structure, technical depth',
    example: 'Fine-tune on company technical docs → generates matching quality',
    costBenefit: 'Consistent documentation without manual review',
  },
  {
    scenario: 'Product catalog',
    styleAdaptation: 'Structured output (JSON, CSV), specific field ordering',
    example: 'Fine-tune on product database → always outputs correct schema',
    costBenefit: 'Reliable structured output without prompt engineering',
  },
  {
    scenario: 'Summary generation',
    styleAdaptation: 'Length constraints, key points extraction',
    example: 'Fine-tune on 5000 doc + summary pairs → learns optimal summarization',
    costBenefit: 'Better compression rates than zero-shot',
  },
];

/**
 * True when the task appears (case-insensitively, as a substring) in any
 * catalogued fine-tuning win scenario. Note: an empty task matches everything.
 */
function shouldFineTune(task: string): boolean {
  const fragment = task.toLowerCase();
  return finetuningWinScenarios.some(entry => entry.scenario.toLowerCase().includes(fragment));
}

Hybrid Approaches

Combine both for best of both worlds: fine-tuning for style, RAG for facts.

// Describes which technique (fine-tuning, retrieval, or both) powers each
// stage of a hybrid pipeline.
interface HybridArchitecture {
  component: 'retrieval' | 'generation' | 'ranking'; // Pipeline stage being described
  finetuning?: boolean; // True when this stage uses a fine-tuned model
  rag?: boolean; // True when this stage uses retrieval
  purpose: string; // Human-readable role of the stage
}

/**
 * Hybrid pipeline: retrieval supplies fresh facts, fine-tuned components
 * supply domain-aware ranking and house-style generation. The retrieval,
 * rerank, and generate methods below are stubs to be wired to real services.
 */
class HybridLLMSystem {
  /**
   * Answers a query by retrieving candidate documents, reranking them with
   * a fine-tuned reranker, and generating a grounded response.
   */
  async processQuery(query: string): Promise<string> {
    // Step 1: standard retrieval, then domain-tuned reranking.
    const candidates = await this.ragSearch(query);
    const ordered = await this.finetuneReranker(query, candidates);

    // Step 2: assemble the context window and generate in the house style.
    const contextWindow = ordered
      .map(doc => `[DOC_${doc.id}] ${doc.content}`)
      .join('\n\n');

    return this.finetuneGenerator(query, contextWindow);
  }

  // Standard RAG retrieval — stub.
  private async ragSearch(query: string): Promise<Array<{ id: string; content: string; score: number }>> {
    return [];
  }

  // Reranker fine-tuned on this domain's relevance judgments — stub (pass-through).
  private async finetuneReranker(
    query: string,
    candidates: Array<{ id: string; content: string; score: number }>
  ): Promise<Array<{ id: string; content: string; score: number }>> {
    return candidates;
  }

  // Generator fine-tuned for company style, grounded in retrieved context — stub.
  private async finetuneGenerator(query: string, context: string): Promise<string> {
    return '';
  }
}

// Example: E-commerce product assistant — RAG keeps the catalog current,
// fine-tuned generation keeps the brand voice consistent.
class ProductAssistant {
  /** Answers a shopper question, grounded in the current product catalog. */
  async answer(userQuery: string): Promise<{ response: string; confidence: number }> {
    // Step 1 (RAG): pull the latest matching products.
    const products = await this.retrieveProducts(userQuery);

    // Step 2 (fine-tuned generation): respond in the company brand voice.
    const productLines = products.map(p => `- ${p.name}: ${p.description}`).join('\n');
    const prompt = `
    User asks: "${userQuery}"

    Available products:
    ${productLines}

    Generate a friendly, helpful response recommending products.
    `;

    const response = await this.generateWithFinetuning(prompt);

    // Confidence is a fixed value calibrated on a validation set.
    return { response, confidence: 0.92 };
  }

  // RAG retrieval over the always-current catalog — stub.
  private async retrieveProducts(query: string): Promise<Array<{ name: string; description: string }>> {
    return [];
  }

  // Model fine-tuned on historical customer conversations — stub.
  private async generateWithFinetuning(prompt: string): Promise<string> {
    return '';
  }
}

Evaluation Metrics: RAGAS and G-Eval

Measure quality of RAG and fine-tuned systems systematically.

// RAGAS (Retrieval-Augmented Generation Assessment) metric bundle.
interface RAGASMetrics {
  faithfulness: number; // 0-1: How much does response stick to retrieved docs?
  answerRelevance: number; // 0-1: How well does response answer the query?
  contextPrecision: number; // 0-1: What fraction of retrieved docs are relevant?
  contextRecall: number; // 0-1: Did we retrieve all relevant docs?
  overallRAGAS: number; // Weighted average of the four components
}

class RAGASEvaluator {
  /**
   * Computes the four RAGAS component scores and a weighted overall score
   * (faithfulness 30%, answer relevance 30%, precision 20%, recall 20%).
   */
  async evaluate(
    query: string,
    retrievedDocs: string[],
    generatedResponse: string,
    groundTruth: string
  ): Promise<RAGASMetrics> {
    const faithfulness = await this.measureFaithfulness(generatedResponse, retrievedDocs);
    const answerRelevance = await this.measureAnswerRelevance(query, generatedResponse, groundTruth);
    const contextPrecision = await this.measureContextPrecision(query, retrievedDocs, groundTruth);
    const contextRecall = await this.measureContextRecall(groundTruth, retrievedDocs);

    const overallRAGAS =
      faithfulness * 0.3 + answerRelevance * 0.3 + contextPrecision * 0.2 + contextRecall * 0.2;

    return { faithfulness, answerRelevance, contextPrecision, contextRecall, overallRAGAS };
  }

  /** Fraction of response claims that appear verbatim in the retrieved context. */
  private async measureFaithfulness(response: string, context: string[]): Promise<number> {
    const haystack = context.join(' ').toLowerCase();
    const claims = await this.extractClaims(response);

    if (claims.length === 0) {
      return 1.0; // No claims → nothing can be unfaithful.
    }

    // Naive support check: a claim counts as supported when it occurs
    // verbatim (case-insensitively) in the concatenated context.
    const supported = claims.filter(claim => haystack.includes(claim.toLowerCase())).length;
    return supported / claims.length;
  }

  /** Semantic similarity between response and ground truth — placeholder. */
  private async measureAnswerRelevance(query: string, response: string, groundTruth: string): Promise<number> {
    // A real implementation would compare embeddings of response vs ground truth.
    return 0.85; // Placeholder
  }

  /** Of the retrieved docs, the fraction judged relevant to the ground truth. */
  private async measureContextPrecision(
    query: string,
    retrievedDocs: string[],
    groundTruth: string
  ): Promise<number> {
    if (retrievedDocs.length === 0) {
      return 0; // Nothing retrieved → precision defined as zero here.
    }

    const relevant = retrievedDocs.filter(doc => this.isDocumentRelevant(doc, groundTruth)).length;
    return relevant / retrievedDocs.length;
  }

  /** Fraction of ground-truth facts found verbatim in the retrieved docs. */
  private async measureContextRecall(groundTruth: string, retrievedDocs: string[]): Promise<number> {
    const truthFacts = await this.extractClaims(groundTruth);

    if (truthFacts.length === 0) {
      return 1.0; // No facts to recover → perfect recall by convention.
    }

    const docText = retrievedDocs.join(' ').toLowerCase();
    const found = truthFacts.filter(fact => docText.includes(fact.toLowerCase())).length;
    return found / truthFacts.length;
  }

  /** Naive sentence split; a production system would use real NLP. */
  private async extractClaims(text: string): Promise<string[]> {
    return text
      .split(/[.!?]/)
      .map(sentence => sentence.trim())
      .filter(sentence => sentence.length > 5);
  }

  /** Crude overlap proxy: does the doc mention the ground truth's first word? */
  private isDocumentRelevant(doc: string, groundTruth: string): boolean {
    const firstWord = groundTruth.toLowerCase().split(' ')[0];
    return doc.toLowerCase().includes(firstWord);
  }
}

// G-Eval: LLM-as-judge evaluation (more nuanced than string matching).
interface GEvalMetrics {
  relevance: number; // 0-1
  coherence: number; // 0-1
  consistency: number; // 0-1
  fluency: number; // 0-1
  overallGEval: number; // Unweighted mean of the four dimensions
}

class GEvalEvaluator {
  /**
   * Scores a response on four G-Eval dimensions — each an LLM-judged 1-5
   * rating normalized to 0-1 — and averages them into an overall score.
   */
  async evaluate(
    query: string,
    response: string,
    groundTruth: string
  ): Promise<GEvalMetrics> {
    const relevance = await this.scoreRelevance(query, response);
    const coherence = await this.scoreCoherence(response);
    const consistency = await this.scoreConsistency(response, groundTruth);
    const fluency = await this.scoreFluency(response);

    const overallGEval = (relevance + coherence + consistency + fluency) / 4;
    return { relevance, coherence, consistency, fluency, overallGEval };
  }

  /** LLM judges how well the response answers the query (1-5, normalized). */
  private async scoreRelevance(query: string, response: string): Promise<number> {
    const prompt = `Query: "${query}"\nResponse: "${response}"\nRelevance score (1-5):`;
    return (await this.callLLMForScore(prompt)) / 5;
  }

  /** LLM judges logical coherence and structure (1-5, normalized). */
  private async scoreCoherence(response: string): Promise<number> {
    const prompt = `Response: "${response}"\nCoherence score (1-5):`;
    return (await this.callLLMForScore(prompt)) / 5;
  }

  /** LLM judges agreement with the ground truth (1-5, normalized). */
  private async scoreConsistency(response: string, groundTruth: string): Promise<number> {
    const prompt = `Response: "${response}"\nGround truth: "${groundTruth}"\nConsistency score (1-5):`;
    return (await this.callLLMForScore(prompt)) / 5;
  }

  /** LLM judges grammar and readability (1-5, normalized). */
  private async scoreFluency(response: string): Promise<number> {
    const prompt = `Response: "${response}"\nFluency score (1-5):`;
    return (await this.callLLMForScore(prompt)) / 5;
  }

  /** Calls the judge LLM and parses out the numeric score — placeholder. */
  private async callLLMForScore(prompt: string): Promise<number> {
    return 4; // Placeholder
  }
}

Cost Comparison Over Time

Compare total cost of RAG vs fine-tuning across different scales.

/** One row of the 12-month cost projection for a single approach. */
interface CostProjection {
  approach: 'rag' | 'fine_tuning';
  monthlyQueries: number;
  initialCost: number;
  monthlyRecurringCost: number;
  month12TotalCost: number;
  breakeven: string; // "Never" or "Month X"
  recommendation: string;
}

class CostComparison {
  /**
   * Projects 12-month total cost for RAG vs fine-tuning.
   *
   * FIX: the retraining cost driven by `updateFrequency` is now folded into
   * the fine-tuning monthly cost BEFORE the 12-month total and the
   * cheaper-approach comparison are computed. Previously it was added only
   * to the reported `monthlyRecurringCost`, so the recommendation and
   * breakeven text could contradict the monthly figure shown.
   */
  compare(
    queryVolume: number, // Queries per month
    dataSize: number, // Documents in knowledge base
    updateFrequency: 'static' | 'monthly' | 'daily'
  ): { rag: CostProjection; fineTuning: CostProjection } {
    // RAG costs: per-query LLM calls + vector DB hosting
    const ragInitial = this.calculateRAGInitial(dataSize);
    const ragMonthly = queryVolume * 0.001 + (dataSize / 1_000_000) * 40; // LLM + vector DB
    const ragYear = ragInitial + ragMonthly * 12;

    // Frequent data changes force periodic retraining of the fine-tuned model
    const updateCost = updateFrequency === 'daily' ? 200 : updateFrequency === 'monthly' ? 50 : 0;

    // Fine-tuning costs: one-time training + inference + infra + retraining
    const finetuneInitial = 300; // Training
    const finetuneMonthly = queryVolume * 0.0002 + 30 + updateCost; // Inference + infra + retrains
    const finetuneYear = finetuneInitial + finetuneMonthly * 12;

    return {
      rag: {
        approach: 'rag',
        monthlyQueries: queryVolume,
        initialCost: ragInitial,
        monthlyRecurringCost: ragMonthly,
        month12TotalCost: ragYear,
        breakeven: 'N/A',
        recommendation: ragYear < finetuneYear ? 'RAG is cheaper' : 'Fine-tuning is cheaper',
      },
      fineTuning: {
        approach: 'fine_tuning',
        monthlyQueries: queryVolume,
        initialCost: finetuneInitial,
        monthlyRecurringCost: finetuneMonthly,
        month12TotalCost: finetuneYear,
        breakeven: ragYear < finetuneYear ? `Never (RAG $${ragYear} < Fine-tune $${finetuneYear})` : 'N/A',
        recommendation: finetuneYear < ragYear ? 'Fine-tuning is cheaper' : 'RAG is cheaper',
      },
    };
  }

  /** One-time setup: embedding generation over the corpus plus fixed setup cost. */
  private calculateRAGInitial(dataSize: number): number {
    return (dataSize * 500) / 1_000_000 * 0.02 + 100; // ~500 tokens/doc at $0.02 per 1M tokens + setup
  }
}

// Example: 50k queries/month, 100k docs, static data
const comparison = new CostComparison().compare(50000, 100000, 'static');
console.log(`RAG: $${comparison.rag.month12TotalCost}`); // ~$749 ($101 setup + $54/month x 12)
console.log(`Fine-tuning: $${comparison.fineTuning.month12TotalCost}`); // ~$780 ($300 training + $40/month x 12)

Fine-Tuning vs RAG Checklist

  • Assess data freshness requirement (static/monthly/daily/realtime)
  • Identify primary query types (factual/style/format)
  • Estimate data size and growth rate
  • Calculate cost projections for both approaches
  • Measure latency requirements
  • Define accuracy targets
  • Build evaluation harness (RAGAS + G-Eval)
  • Run A/B test if borderline
  • Plan hybrid approach if both factors are important
  • Monitor costs and quality metrics post-launch

Conclusion

Fine-tuning and RAG aren't competing—they're complementary. RAG retrieves facts, fine-tuning encodes style. Use RAG for live data and citations; use fine-tuning for consistent tone and format. For most applications, hybrid wins: RAG for knowledge, fine-tuning for personality.