Published on

Managing LLM Context Windows — When Your Conversation Is Too Long

Authors

Introduction

Context windows define how much text an LLM can see. GPT-4o allows 128K tokens. A multi-turn conversation spanning weeks will exceed this. This guide shows production-tested patterns for handling long contexts while minimizing information loss.

Context Window Limits Across Models

Different models have different limits. Choosing the right model for your use case is critical.

/**
 * Static capability metadata for a single LLM, keyed by model name in
 * ContextWindowRegistry.
 */
interface ModelCapabilities {
  /** Provider model identifier (e.g. 'gpt-4o'). */
  model: string;
  /** Context window size, in tokens. */
  max_tokens: number;
  /** Maximum completion tokens per request. */
  max_output: number;
  /** Training-data cutoff, formatted 'YYYY-MM'. */
  training_cutoff: string;
  /** Input price in USD per 1K prompt tokens. */
  cost_per_1k_input: number;
}

class ContextWindowRegistry {
  private models: Record<string, ModelCapabilities> = {
    'gpt-4o': {
      model: 'gpt-4o',
      max_tokens: 128000,
      max_output: 4096,
      training_cutoff: '2024-04',
      cost_per_1k_input: 0.015
    },
    'gpt-3.5-turbo': {
      model: 'gpt-3.5-turbo',
      max_tokens: 16384,
      max_output: 4096,
      training_cutoff: '2024-01',
      cost_per_1k_input: 0.0005
    },
    'claude-3-opus': {
      model: 'claude-3-opus',
      max_tokens: 200000,
      max_output: 4096,
      training_cutoff: '2024-08',
      cost_per_1k_input: 0.015
    },
    'gemini-2.0-pro': {
      model: 'gemini-2.0-pro',
      max_tokens: 100000,
      max_output: 4096,
      training_cutoff: '2024-09',
      cost_per_1k_input: 0.01
    }
  };

  getModel(name: string): ModelCapabilities {
    const model = this.models[name];
    if (!model) throw new Error(`Unknown model: ${name}`);
    return model;
  }

  selectModelForContext(estimatedTokens: number, priority: 'cost' | 'capability'): string {
    const candidates = Object.values(this.models)
      .filter(m => m.max_tokens &gt;= estimatedTokens);

    if (candidates.length === 0) {
      throw new Error(`No model supports ${estimatedTokens} tokens`);
    }

    if (priority === 'cost') {
      return candidates.reduce((a, b) => a.cost_per_1k_input &lt; b.cost_per_1k_input ? a : b).model;
    }
    return candidates.reduce((a, b) => a.max_tokens &gt; b.max_tokens ? a : b).model;
  }
}

Sliding Window for Long Chats

Keep only the most recent N messages plus system prompt. Older context is dropped.

/** One chat turn tracked by the sliding window. */
interface Message {
  /** Unique id; used to remove a specific message from the window. */
  id: string;
  role: 'user' | 'assistant' | 'system';
  content: string;
  /** Pre-computed token count for this message's content. */
  tokens: number;
  timestamp: Date;
}

class SlidingWindowManager {
  private window: Message[] = [];
  private maxWindowTokens: number = 100000;
  private encoder: any;

  constructor(encoder: any, maxTokens: number = 100000) {
    this.encoder = encoder;
    this.maxWindowTokens = maxTokens;
  }

  addMessage(message: Message): void {
    this.window.push(message);
    this.enforceWindowSize();
  }

  private enforceWindowSize(): void {
    let totalTokens = this.window.reduce((sum, m) => sum + m.tokens, 0);

    // Keep removing oldest non-system messages until we fit
    while (totalTokens &gt; this.maxWindowTokens &amp;&amp; this.window.length &gt; 1) {
      const oldest = this.window.find(m => m.role !== 'system');
      if (!oldest) break;

      totalTokens -= oldest.tokens;
      this.window = this.window.filter(m => m.id !== oldest.id);
    }
  }

  getContextMessages(): Message[] {
    return [...this.window];
  }

  getCurrentWindowUsage(): { tokens: number; percentage: number } {
    const tokens = this.window.reduce((sum, m) => sum + m.tokens, 0);
    return {
      tokens,
      percentage: (tokens / this.maxWindowTokens) * 100
    };
  }

  shouldTriggerSummarization(): boolean {
    const usage = this.getCurrentWindowUsage();
    return usage.percentage &gt; 85;
  }
}

Conversation Summarization at Threshold

When reaching 85% capacity, summarize older messages to reclaim tokens.

/** Tuning knobs for ConversationSummarizer. */
interface SummarizationConfig {
  /** Window-usage percentage at which summarization should trigger (e.g. 85). */
  threshold_percentage: number;
  /** Model name to use for the summarization call. */
  model: string;
  /** Number of newest messages kept verbatim (never summarized). */
  keep_recent_messages: number;
}

/**
 * Folds older conversation turns into a single summary message so window
 * tokens can be reclaimed.
 */
class ConversationSummarizer {
  /**
   * Summarize every message except the `keep_recent_messages` newest ones.
   * Returns an empty summary (0 tokens saved) when nothing is old enough
   * to fold away.
   */
  async summarizeWindow(
    messages: Message[],
    config: SummarizationConfig
  ): Promise<{ summary: string; tokens_saved: number }> {
    // Clamp to zero: slice(0, negative) would wrongly treat the FIRST
    // messages as "old" when keep_recent_messages > messages.length.
    const oldCount = Math.max(0, messages.length - config.keep_recent_messages);
    const oldMessages = messages.slice(0, oldCount);

    if (oldMessages.length === 0) {
      return { summary: '', tokens_saved: 0 };
    }

    // Flatten the old turns into plain text for the summarization prompt.
    const conversationText = oldMessages
      .map(m => `${m.role}: ${m.content}`)
      .join('\n\n');

    // In production this prompt is sent to an LLM; here we return a stub.
    const summaryPrompt = `Summarize this conversation into key points and decisions:\n\n${conversationText}`;
    void summaryPrompt; // placeholder until the real API call is wired in

    const summary = `[Summary of ${oldMessages.length} earlier messages in conversation]`;
    const tokensSaved = oldMessages.reduce((sum, m) => sum + m.tokens, 0);

    return { summary, tokens_saved: tokensSaved };
  }

  /**
   * Rebuild a compact context: system prompt, the summary message, then the
   * last five conversation messages.
   * Assumes originalMessages[0] is the system prompt — TODO confirm callers
   * uphold this.
   */
  createSummarizedContext(
    originalMessages: Message[],
    summary: string
  ): Message[] {
    const summaryMessage: Message = {
      id: 'summary-' + Date.now(),
      role: 'system',
      content: `Earlier conversation summary:\n${summary}`,
      // Whitespace split is only a rough token estimate; use a real
      // encoder if precise accounting matters.
      tokens: summary.split(' ').length,
      timestamp: new Date()
    };

    return [
      originalMessages[0], // system prompt
      summaryMessage,
      // slice(1) first so a short conversation never duplicates the
      // system prompt inside the recent-message tail.
      ...originalMessages.slice(1).slice(-5)
    ];
  }
}

Map-Reduce for Long Document Q&A

Break documents into chunks, query each chunk independently, then reduce answers.

/** Per-chunk outcome of the map step in map-reduce document Q&A. */
interface ChunkResult {
  /** Index of the chunk within the split document. */
  chunk_id: number;
  /** The chunk's original text. */
  text: string;
  /** The model's answer for this chunk in isolation. */
  answer: string;
  /** Relevance of this chunk's answer; used to rank in the reduce step. */
  relevance_score: number;
}

class MapReduceDocumentQA {
  private chunkSize: number = 2000; // tokens

  splitDocumentIntoChunks(document: string, encoder: any): string[] {
    const tokens = encoder.encode(document);
    const chunks: string[] = [];

    for (let i = 0; i &lt; tokens.length; i += this.chunkSize) {
      const chunkTokens = tokens.slice(i, i + this.chunkSize);
      chunks.push(encoder.decode(chunkTokens));
    }

    return chunks;
  }

  async mapQueryToChunks(
    chunks: string[],
    question: string,
    client: any
  ): Promise<ChunkResult[]> {
    const results: ChunkResult[] = [];

    for (let i = 0; i &lt; chunks.length; i++) {
      const prompt = `Context:\n${chunks[i]}\n\nQuestion: ${question}\n\nAnswer:`;

      // In production, batch these requests
      const response = await client.chat.completions.create({
        model: 'gpt-4o',
        messages: [{ role: 'user', content: prompt }],
        max_tokens: 200
      });

      results.push({
        chunk_id: i,
        text: chunks[i],
        answer: response.choices[0].message.content,
        relevance_score: 0.5 // Would compute actual relevance
      });
    }

    return results;
  }

  reduceAnswers(results: ChunkResult[]): string {
    const topResults = results
      .sort((a, b) => b.relevance_score - a.relevance_score)
      .slice(0, 3);

    const consolidatedAnswer = topResults
      .map(r => r.answer)
      .join('\n\nAlso: ');

    return consolidatedAnswer;
  }
}

Hierarchical Summarization

Multi-level summarization: chunk summaries → section summaries → document summary.

/** A node in the hierarchical summary tree (level 1 = chunk, 2 = section). */
interface SummaryNode {
  /** Depth label: 1 for chunk summaries, 2 for the root section summary. */
  level: number;
  /** The summary text at this node. */
  content: string;
  /** Token count of `content`, measured by the caller's encoder. */
  tokens: number;
  /** Child summaries this node was built from; undefined for leaves. */
  children?: SummaryNode[];
}

class HierarchicalSummarizer {
  async buildSummaryTree(
    document: string,
    encoder: any,
    client: any
  ): Promise<SummaryNode> {
    // Level 0: Split into chunks
    const chunkSize = 2000;
    const tokens = encoder.encode(document);
    const chunks: string[] = [];

    for (let i = 0; i &lt; tokens.length; i += chunkSize) {
      chunks.push(encoder.decode(tokens.slice(i, i + chunkSize)));
    }

    // Level 1: Summarize each chunk
    const chunkSummaries: SummaryNode[] = await Promise.all(
      chunks.map(async (chunk) => {
        const summary = await this.summarizeChunk(chunk, client);
        return {
          level: 1,
          content: summary,
          tokens: encoder.encode(summary).length,
          children: undefined
        };
      })
    );

    // Level 2: Summarize summaries
    const sectionSummary = await this.summarizeChunk(
      chunkSummaries.map(s => s.content).join('\n'),
      client
    );

    return {
      level: 2,
      content: sectionSummary,
      tokens: encoder.encode(sectionSummary).length,
      children: chunkSummaries
    };
  }

  private async summarizeChunk(chunk: string, client: any): Promise<string> {
    const response = await client.chat.completions.create({
      model: 'gpt-4o',
      messages: [
        {
          role: 'user',
          content: `Summarize this in 1-2 sentences:\n${chunk}`
        }
      ],
      max_tokens: 100
    });

    return response.choices[0].message.content;
  }

  extractSummaryLevel(tree: SummaryNode, level: number): string {
    if (tree.level === level) return tree.content;
    if (!tree.children || tree.children.length === 0) return tree.content;
    return tree.children.map(c => this.extractSummaryLevel(c, level)).join('\n');
  }
}

Lost-in-the-Middle Problem

The "lost-in-the-middle" phenomenon: LLMs pay less attention to information in the middle of context. Place critical info at the start or end.

/** A context fragment tagged with its placement priority. */
interface ContextOptimization {
  /** 'critical' items are placed at the high-attention edges of the prompt. */
  priority: 'critical' | 'supporting' | 'optional';
  content: string;
}

/**
 * Mitigates the lost-in-the-middle effect by placing critical content at
 * the edges of the prompt and tagging it for emphasis.
 */
class ContextPositioningStrategy {
  /**
   * Order items critical → supporting → optional, then repeat the critical
   * items at the very end so they occupy both high-attention positions.
   */
  organizeContextByImportance(
    items: Array<{ content: string; priority: 'critical' | 'supporting' | 'optional' }>
  ): string {
    const critical = items.filter(i => i.priority === 'critical');
    const supporting = items.filter(i => i.priority === 'supporting');
    const optional = items.filter(i => i.priority === 'optional');

    return [
      ...critical.map(i => i.content),
      ...supporting.map(i => i.content),
      ...optional.map(i => i.content),
      `IMPORTANT REMINDERS: ${critical.map(i => i.content).join('; ')}`
    ].join('\n\n');
  }

  /**
   * Wrap lines starting with Important:/Critical:/Note: in <critical> tags.
   * The 'g' flag ensures every such line is tagged, not just the first.
   */
  addAnchorsToContext(context: string): string {
    return context.replace(
      /^(Important:|Critical:|Note:)/gm,
      '<critical>$1</critical>'
    );
  }
}

Context Stuffing Anti-Patterns

Never just concatenate raw documents into the prompt — give the model explicit structure so it can tell where one document ends and the next begins.

/**
 * Three ways to assemble multi-document context, from worst to best.
 */
class ContextStuffingPrevention {
  // Anti-pattern: raw concatenation — the model cannot tell documents apart.
  badApproach(documents: string[]): string {
    return documents.join('\n');
  }

  // Good: markdown headings plus separators give each document a boundary.
  goodApproach(
    documents: Array<{ title: string; content: string }>
  ): string {
    return documents
      .map(doc => `## ${doc.title}\n${doc.content}`)
      .join('\n\n---\n\n');
  }

  // Best: XML-like tags make the structure unambiguous and machine-parsable.
  bestApproach(
    documents: Array<{ title: string; content: string }>
  ): string {
    return documents
      .map(
        doc =>
          `<document>\n  <title>${doc.title}</title>\n  <content>${doc.content}</content>\n</document>`
      )
      .join('\n');
  }
}

Token Counting Before Sending

Always count tokens before calling an LLM API to avoid exceeding limits.

/** Result of a pre-send token-budget check. */
interface PreflightCheck {
  /** True when prompt + estimated response fits the model's window. */
  will_fit: boolean;
  /** Measured prompt tokens (system + user). */
  prompt_tokens: number;
  /** Caller-supplied estimate of response size, echoed back. */
  estimated_response_tokens: number;
  /** Remaining headroom in tokens; negative when over budget. */
  margin: number;
}

class PreflightValidator {
  constructor(private encoder: any, private maxTokens: number = 128000) {}

  validateBeforeSend(
    systemPrompt: string,
    userMessage: string,
    estimatedResponseTokens: number = 2000
  ): PreflightCheck {
    const systemTokens = this.encoder.encode(systemPrompt).length;
    const userTokens = this.encoder.encode(userMessage).length;
    const totalPromptTokens = systemTokens + userTokens;

    const totalTokensNeeded = totalPromptTokens + estimatedResponseTokens;
    const will_fit = totalTokensNeeded &lt;= this.maxTokens;

    return {
      will_fit,
      prompt_tokens: totalPromptTokens,
      estimated_response_tokens: estimatedResponseTokens,
      margin: this.maxTokens - totalTokensNeeded
    };
  }

  truncateIfNeeded(
    text: string,
    maxTokens: number
  ): string {
    const tokens = this.encoder.encode(text);
    if (tokens.length &lt;= maxTokens) return text;

    const truncatedTokens = tokens.slice(0, maxTokens - 50); // Leave buffer
    return this.encoder.decode(truncatedTokens);
  }
}

Retrieval-Augmented Truncation

Only include relevant retrieved documents, not everything.

/** One retrieved document candidate for context assembly. */
interface RetrievalResult {
  id: string;
  content: string;
  /** Similarity to the query; scores above 0.85 are treated as must-include. */
  similarity_score: number;
  /** Pre-computed token cost of including this document. */
  token_count: number;
}

class RetrievalAugmentedTruncation {
  selectRelevantDocuments(
    query: string,
    results: RetrievalResult[],
    maxTokens: number
  ): RetrievalResult[] {
    const sorted = results.sort((a, b) => b.similarity_score - a.similarity_score);
    let totalTokens = 0;
    const selected: RetrievalResult[] = [];

    for (const result of sorted) {
      if (totalTokens + result.token_count &lt;= maxTokens) {
        selected.push(result);
        totalTokens += result.token_count;
      } else if (result.similarity_score &gt; 0.85) {
        // Include very relevant docs even if over budget
        selected.push(result);
        totalTokens += result.token_count;
      }
    }

    return selected;
  }

  buildContextFromRetrievals(
    results: RetrievalResult[]
  ): string {
    return results
      .map(r => `&lt;doc id="${r.id}" relevance="${(r.similarity_score * 100).toFixed(0)}%"&gt;\n${r.content}\n&lt;/doc&gt;`)
      .join('\n\n');
  }
}

Checklist

  • Profile your conversation patterns to estimate token growth
  • Choose models with context windows 3x your max expected usage
  • Implement sliding window to drop old messages when exceeding capacity
  • Trigger automatic summarization at 85% window usage
  • Use map-reduce for document Q&A, not full context stuffing
  • Place critical information at the start/end to avoid lost-in-middle
  • Always precount tokens before API calls
  • Structure context with XML/markdown, not raw concatenation
  • Retrieve only relevant documents up to token budget
  • Build a summarization queue to cache summaries for reuse

Conclusion

Context window management is not about working around limits—it's about working within them strategically. Sliding windows handle conversations, hierarchical summarization handles documents, and strategic placement handles the lost-in-middle problem. Master these patterns and you'll ship LLM apps that scale.