- Published on
LLM Cost Optimization — Cutting Your AI Bill by 80% Without Degrading Quality
- Authors

- Name
- Sanjeev Sharma
- @webcoderspeed1
Introduction
LLM costs compound quickly. A single poorly optimized feature can cost thousands monthly. This guide shows production techniques used by scaled AI applications to cut costs by 80% without sacrificing quality.
- Token Counting Before Requests
- Semantic Caching With Redis
- Prompt Compression
- Model Routing Strategy
- Request Batching
- Cost Attribution Per Feature
- Budget Alerts With Hard Cutoffs
- Checklist
- Conclusion
Token Counting Before Requests
Always estimate tokens before calling an LLM. Tiktoken helps you catch expensive requests early.
import { encoding_for_model } from 'js-tiktoken';
/**
 * Pre-flight cost estimator: tokenizes chat messages before they are sent so
 * requests that would blow the budget can be rejected up front.
 */
class TokenCounter {
  // NOTE(review): always tokenizes with the gpt-4 encoding, even when a
  // different model is passed to estimateCost — counts are approximate.
  private enc = encoding_for_model('gpt-4');

  /**
   * Estimate total tokens and dollar cost for a message list.
   * Completion length is assumed to be ~1.5x the prompt length.
   */
  estimateCost(messages: Array<{ role: string; content: string }>, model: string): {
    estimatedTokens: number;
    estimatedCost: number;
  } {
    // Per-token USD rates; unknown models fall back to the cheapest tier.
    const modelPricing: Record<string, { input: number; output: number }> = {
      'gpt-4': { input: 0.00003, output: 0.00006 },
      'gpt-3.5-turbo': { input: 0.0000005, output: 0.0000015 },
    };

    // Each message carries ~4 tokens of chat-format framing overhead.
    const totalTokens = messages.reduce(
      (sum, message) => sum + this.enc.encode(message.content).length + 4,
      0
    );

    const pricing = modelPricing[model] || modelPricing['gpt-3.5-turbo'];
    // Input cost plus assumed 1.5x-length completion at output rates.
    const estimatedCost = (totalTokens * pricing.input) + (totalTokens * 1.5 * pricing.output);

    return { estimatedTokens: totalTokens, estimatedCost };
  }

  /** True when the estimate is strictly under the budget (both in USD). */
  shouldProceed(estimatedCost: number, budget: number): boolean {
    return estimatedCost < budget;
  }
}
// Usage
// Estimate the cost up front and refuse to dispatch if it exceeds a $0.05 budget.
const counter = new TokenCounter();
const estimation = counter.estimateCost(
[{ role: 'user', content: 'Analyze this 10K filing...' }],
'gpt-4'
);
if (!counter.shouldProceed(estimation.estimatedCost, 0.05)) {
throw new Error(`Request exceeds budget: $${estimation.estimatedCost}`);
}
Semantic Caching With Redis
Cache at the semantic level using embeddings, not just string matching. Equivalent requests phrased in different words should hit the same cache entry.
import Redis from 'ioredis';
import { OpenAI } from 'openai';
/**
 * Semantic response cache backed by Redis: responses are stored alongside
 * their prompt's embedding, and lookups match by cosine similarity so
 * differently-worded but equivalent prompts can share one cached response.
 *
 * Fix over the original: entries were keyed by `cache:<JSON of the embedding>`,
 * so a lookup could only ever find the entry for the byte-identical prompt and
 * then compared that embedding with itself (similarity always 1) — the
 * "semantic" matching never happened. Entries are now stored under a
 * per-prompt key registered in an index set, and `get` scans the index and
 * compares the query embedding against every stored embedding.
 */
class SemanticCache {
  private redis: Redis;
  private openai: OpenAI;
  private similarityThreshold = 0.95;
  // Redis set holding the keys of all live cache entries.
  private indexKey = 'semcache:index';

  constructor(redisUrl: string) {
    this.redis = new Redis(redisUrl);
    this.openai = new OpenAI();
  }

  /** Embed `text` with OpenAI's small embedding model. */
  async getEmbedding(text: string): Promise<number[]> {
    const response = await this.openai.embeddings.create({
      model: 'text-embedding-3-small',
      input: text,
    });
    return response.data[0].embedding;
  }

  /** Cosine similarity of two equal-length vectors; 0 when either is all-zero. */
  cosineSimilarity(vecA: number[], vecB: number[]): number {
    const dotProduct = vecA.reduce((sum, a, i) => sum + a * vecB[i], 0);
    const magnitudeA = Math.sqrt(vecA.reduce((sum, a) => sum + a * a, 0));
    const magnitudeB = Math.sqrt(vecB.reduce((sum, b) => sum + b * b, 0));
    if (magnitudeA === 0 || magnitudeB === 0) return 0; // avoid NaN from 0/0
    return dotProduct / (magnitudeA * magnitudeB);
  }

  /**
   * Return the cached response most similar to `prompt`, or null when no
   * stored entry reaches the similarity threshold.
   * NOTE(review): this scans every entry (O(n) Redis round-trips); swap in a
   * vector index (e.g. Redis Stack vector search) once the cache grows.
   */
  async get(prompt: string): Promise<string | null> {
    const embedding = await this.getEmbedding(prompt);
    const keys = await this.redis.smembers(this.indexKey);
    let bestResponse: string | null = null;
    let bestSimilarity = this.similarityThreshold;
    for (const key of keys) {
      const cached = await this.redis.get(key);
      if (!cached) {
        // Entry expired via TTL; drop the stale index reference.
        await this.redis.srem(this.indexKey, key);
        continue;
      }
      const stored = JSON.parse(cached) as { embedding: number[]; response: string };
      const similarity = this.cosineSimilarity(embedding, stored.embedding);
      if (similarity >= bestSimilarity) {
        bestSimilarity = similarity;
        bestResponse = stored.response;
      }
    }
    return bestResponse;
  }

  /** Store a response for `prompt` with a 30-day TTL and register it in the index. */
  async set(prompt: string, response: string): Promise<void> {
    const embedding = await this.getEmbedding(prompt);
    const key = `semcache:entry:${prompt}`;
    const ttl = 86400 * 30; // 30 days
    await this.redis.setex(key, ttl, JSON.stringify({ embedding, response }));
    await this.redis.sadd(this.indexKey, key);
  }
}
// Usage
// Look up the prompt semantically; on a miss, produce a response and cache it.
// (Top-level await — assumes an ES module context.)
const cache = new SemanticCache(process.env.REDIS_URL!);
const cachedResult = await cache.get('What is machine learning?');
if (cachedResult) {
console.log('Cache hit:', cachedResult);
} else {
const response = 'Fresh response from LLM...'; // placeholder for a real LLM call
await cache.set('What is machine learning?', response);
}
Prompt Compression
Reduce token count without losing context using abstractive summarization or token pruning.
/**
 * Reduces prompt token counts with cheap, lossy text transforms.
 * NOTE(review): the `compress` strategies are illustrative heuristics — they
 * discard content, so validate output quality before production use.
 */
class PromptCompressor {
  /**
   * Apply a lossy compression strategy.
   * - 'aggressive': keeps every third word.
   * - 'balanced': collapses vowel runs to 'a' and squeezes repeated whitespace.
   */
  async compress(text: string, target: 'aggressive' | 'balanced' = 'balanced'): Promise<string> {
    const strategies: Record<string, (t: string) => string> = {
      aggressive: (t) => t.split(' ').filter((_, i) => i % 3 === 0).join(' '),
      balanced: (t) => t.replace(/[aeiou]{2,}/gi, 'a').replace(/\s+/g, ' '),
    };
    return strategies[target](text);
  }

  /**
   * Strip boilerplate passages (disclaimers, confidentiality notices, legal
   * text) running up to the next blank line or the end of the input.
   *
   * Fix over the original: the patterns used `\Z`, which JavaScript regexes do
   * not support — it matches a literal "Z", so boilerplate at the end of the
   * input was never removed. End of input is now matched with `$` (valid here
   * because the `m` flag is not set; the `s` flag lets `.` span newlines).
   */
  removeBoilerplate(text: string): string {
    const boilerplatePatterns = [
      /disclaimer:.*?(?=\n\n|$)/is,
      /confidential.*?(?=\n\n|$)/is,
      /legal.*?(?=\n\n|$)/is,
    ];
    let result = text;
    for (const pattern of boilerplatePatterns) {
      result = result.replace(pattern, '');
    }
    return result.trim();
  }

  /** Keep the first `keepRatio` fraction of lines (simple head-truncation). */
  summarizeLines(text: string, keepRatio: number = 0.7): string {
    const lines = text.split('\n');
    const keepCount = Math.ceil(lines.length * keepRatio);
    return lines.slice(0, keepCount).join('\n');
  }
}
// Usage: compress first, then strip boilerplate, before sending to the LLM.
const compressor = new PromptCompressor();
const original = 'Very long document content...';
const compressed = await compressor.compress(original, 'balanced'); // top-level await
const cleaned = compressor.removeBoilerplate(compressed);
Model Routing Strategy
Use cheaper models first, escalate to expensive models only when confidence is low.
/** Outcome of a routing decision: the chosen model plus cost and rationale. */
interface RoutingDecision {
  model: string;
  reason: string;
  costEstimate: number;
}

/**
 * Routes prompts to the cheapest model that fits, escalating to pricier
 * tiers only for complex prompts (and premium only when the caller also
 * demands high confidence).
 */
class ModelRouter {
  // Tiers ordered cheapest-first; cost is USD per input token.
  private models = [
    { name: 'gpt-3.5-turbo', cost: 0.0000005, tier: 'budget' },
    { name: 'gpt-4-turbo', cost: 0.00001, tier: 'standard' },
    { name: 'gpt-4', cost: 0.00003, tier: 'premium' },
  ];

  /** Pick a model for `prompt`; higher thresholds bias toward stronger tiers. */
  async route(prompt: string, confidenceThreshold: number = 0.8): Promise<RoutingDecision> {
    // Rough token estimate: ~4 characters per token.
    const promptTokens = Math.ceil(prompt.length / 4);
    const complexity = promptTokens > 1000 ? 'high' : 'low';

    // Default to the budget tier; escalate only for complex prompts.
    let tierIndex = 0;
    if (complexity === 'high') {
      tierIndex = confidenceThreshold > 0.9 ? 2 : 1;
    }
    const chosen = this.models[tierIndex];

    return {
      model: chosen.name,
      reason: `Routed to ${chosen.tier} based on complexity: ${complexity}`,
      costEstimate: promptTokens * chosen.cost,
    };
  }
}
// Usage: a short prompt routes to the budget tier.
const router = new ModelRouter();
const decision = await router.route('Simple question?', 0.7); // top-level await
console.log(`Using ${decision.model}: ${decision.reason}`);
Request Batching
Batch multiple requests to reduce API overhead and improve throughput.
/**
 * Collects individual prompt requests and dispatches them in groups, flushing
 * when a batch fills or on a fixed interval.
 *
 * Fixes over the original:
 * - `resolve`/`reject` were typed as the banned `Function` type; now precise.
 * - The API call omitted the required `model` field, the JSON content-type
 *   header, and authentication, so every request would have been rejected.
 * - The flush interval could never be cleared, pinning the event loop open;
 *   `stop()` (a backward-compatible addition) shuts it down.
 */
class RequestBatcher {
  private queue: Array<{
    id: string;
    prompt: string;
    resolve: (response: string) => void;
    reject: (error: unknown) => void;
  }> = [];
  private batchSize = 10;
  private flushInterval = 100; // ms
  private timer: ReturnType<typeof setInterval>;

  constructor() {
    this.timer = setInterval(() => void this.flush(), this.flushInterval);
  }

  /** Stop the periodic flush; items still queued are not dispatched. */
  stop(): void {
    clearInterval(this.timer);
  }

  /** Queue a prompt; resolves with the model's reply once its batch completes. */
  async add(id: string, prompt: string): Promise<string> {
    return new Promise((resolve, reject) => {
      this.queue.push({ id, prompt, resolve, reject });
      if (this.queue.length >= this.batchSize) {
        void this.flush(); // flush immediately once a full batch is ready
      }
    });
  }

  /** Send up to one batch of queued prompts in parallel and settle their promises. */
  private async flush(): Promise<void> {
    if (this.queue.length === 0) return;
    const batch = this.queue.splice(0, this.batchSize);
    try {
      const responses = await Promise.all(
        batch.map((item) =>
          fetch('https://api.openai.com/v1/chat/completions', {
            method: 'POST',
            headers: {
              'Content-Type': 'application/json',
              Authorization: `Bearer ${process.env.OPENAI_API_KEY}`,
            },
            body: JSON.stringify({
              model: 'gpt-3.5-turbo',
              messages: [{ role: 'user', content: item.prompt }],
            }),
          }).then((r) => {
            if (!r.ok) throw new Error(`LLM request failed: ${r.status}`);
            return r.json();
          })
        )
      );
      batch.forEach((item, i) => {
        item.resolve(responses[i].choices[0].message.content);
      });
    } catch (error) {
      // One failure rejects the whole batch; callers may retry individually.
      batch.forEach((item) => item.reject(error));
    }
  }
}
// Usage: both requests join the same batch window; await the promises for replies.
const batcher = new RequestBatcher();
const result1 = batcher.add('req1', 'First prompt'); // Promise<string>
const result2 = batcher.add('req2', 'Second prompt'); // Promise<string>
Cost Attribution Per Feature
Track which features drive AI costs for better ROI analysis.
/** In-memory ledger attributing LLM spend to product features for ROI analysis. */
class CostAttributor {
  // One entry per logged request.
  private costLog: Array<{
    feature: string;
    tokens: number;
    cost: number;
    timestamp: Date;
  }> = [];

  /** Record one request's token usage and derived cost against a feature. */
  logRequest(feature: string, inputTokens: number, outputTokens: number, model: string): void {
    // Per-token USD rates; unknown models fall back to the cheapest tier.
    const pricing: Record<string, { input: number; output: number }> = {
      'gpt-4': { input: 0.00003, output: 0.00006 },
      'gpt-3.5-turbo': { input: 0.0000005, output: 0.0000015 },
    };
    const rates = pricing[model] || pricing['gpt-3.5-turbo'];
    this.costLog.push({
      feature,
      tokens: inputTokens + outputTokens,
      cost: inputTokens * rates.input + outputTokens * rates.output,
      timestamp: new Date(),
    });
  }

  /** Sum cost per feature over the trailing day/week/month window. */
  getFeatureCosts(timeframe: 'day' | 'week' | 'month' = 'day'): Record<string, number> {
    const windowMs = { day: 86400000, week: 604800000, month: 2592000000 }[timeframe];
    const cutoff = new Date(Date.now() - windowMs);
    return this.costLog
      .filter((entry) => entry.timestamp >= cutoff)
      .reduce<Record<string, number>>((totals, entry) => {
        totals[entry.feature] = (totals[entry.feature] || 0) + entry.cost;
        return totals;
      }, {});
  }

  /** Top `limit` features by trailing-month spend, most expensive first. */
  getHighestCostFeatures(limit: number = 5): Array<{ feature: string; cost: number }> {
    return Object.entries(this.getFeatureCosts('month'))
      .map(([feature, cost]) => ({ feature, cost }))
      .sort((a, b) => b.cost - a.cost)
      .slice(0, limit);
  }
}
// Usage: log two requests, then report the most expensive features.
const attributor = new CostAttributor();
attributor.logRequest('search', 150, 200, 'gpt-3.5-turbo');
attributor.logRequest('summarize', 2000, 500, 'gpt-4');
console.log('Top cost features:', attributor.getHighestCostFeatures(3));
Budget Alerts With Hard Cutoffs
Enforce spending limits in real-time to prevent budget overruns.
/**
 * Enforces per-feature daily spending limits with hard cutoffs.
 *
 * Fix over the original: day-rollover state was stored in `localStorage`,
 * which does not exist in Node.js — `resetDaily` would throw a ReferenceError
 * at runtime in the server context this code targets. The current day is now
 * tracked in an instance field instead.
 */
class BudgetEnforcer {
  private spentToday: Record<string, number> = {};
  private dailyBudgets: Record<string, number>;
  // UTC date (YYYY-MM-DD) that spentToday's totals belong to.
  private currentDay = '';

  constructor(budgets: Record<string, number>) {
    this.dailyBudgets = budgets;
    this.resetDaily();
  }

  /** Clear accumulated spend when the UTC date has changed since the last call. */
  private resetDaily(): void {
    const today = new Date().toISOString().split('T')[0];
    if (this.currentDay !== today) {
      this.spentToday = {};
      this.currentDay = today;
    }
  }

  /** Check whether spending `estimatedCost` keeps `feature` within its daily budget. */
  canProceed(feature: string, estimatedCost: number): { allowed: boolean; reason: string } {
    this.resetDaily();
    const currentSpend = this.spentToday[feature] || 0;
    // Features without an explicit budget default to $100/day.
    const featureBudget = this.dailyBudgets[feature] || 100;
    if (currentSpend + estimatedCost > featureBudget) {
      return {
        allowed: false,
        reason: `Feature ${feature} would exceed daily budget. Current: $${currentSpend}, Limit: $${featureBudget}`,
      };
    }
    return { allowed: true, reason: 'Within budget' };
  }

  /** Record actual spend against a feature's daily total. */
  recordSpend(feature: string, amount: number): void {
    this.spentToday[feature] = (this.spentToday[feature] || 0) + amount;
  }

  /** Snapshot of spend vs budget (and utilization ratio) per configured feature. */
  getSpendStatus(): Record<string, { spent: number; budget: number; utilization: number }> {
    return Object.entries(this.dailyBudgets).reduce(
      (acc, [feature, budget]) => {
        const spent = this.spentToday[feature] || 0;
        acc[feature] = { spent, budget, utilization: spent / budget };
        return acc;
      },
      {} as Record<string, { spent: number; budget: number; utilization: number }>
    );
  }
}
// Usage: check the budget before making the call, then record the actual spend.
const enforcer = new BudgetEnforcer({ search: 10, summarize: 5, classify: 2 });
const check = enforcer.canProceed('search', 0.50);
if (check.allowed) {
enforcer.recordSpend('search', 0.50);
}
Checklist
- Implement token counting before every API call
- Set up semantic caching with embedding-based retrieval
- Remove boilerplate and compress prompts aggressively
- Route requests to cheapest models first, escalate conditionally
- Batch requests to amortize API costs
- Track costs per feature for ROI analysis
- Enforce hard budget cutoffs with real-time monitoring
- Review top-cost features weekly and optimize them
Conclusion
LLM cost optimization is a continuous process. Start with token counting and semantic caching for immediate savings, then implement model routing and budget controls for scale. Most teams can achieve 60-80% cost reductions without quality loss by combining these techniques.