- Published on
Building Observability From Scratch — Metrics, Logs, and Traces Without the Complexity
- Authors

- Name
- Sanjeev Sharma
- @webcoderspeed1
Introduction
Observability is not monitoring dashboards or log aggregation—it's the ability to understand your system's behavior from the outside, asking arbitrary questions you didn't anticipate. Real observability starts simple: structured JSON logs, Prometheus-scraped metrics, and OpenTelemetry traces. This post builds observability from scratch: collecting metrics with Prometheus, storing logs with Loki in JSON format, tracing with Tempo, and correlating all three using trace IDs. No vendor lock-in, no complexity until you need it.
- Prometheus Metrics Collection
- Structured Logging with Pino
- Loki for Log Aggregation
- OpenTelemetry Traces with Tempo
- Correlating Traces, Logs, and Metrics
- Alert Rules from SLI Metrics
- Grafana Dashboard Setup
- Checklist
- Conclusion
Prometheus Metrics Collection
Install Prometheus and scrape your application:
# prometheus/prometheus.yml
#
# Global scrape/evaluation cadence plus external labels attached to every
# series when this Prometheus talks to external systems (Alertmanager,
# federation, remote write).
global:
  scrape_interval: 15s
  evaluation_interval: 15s
  external_labels:
    cluster: 'production'
    environment: 'prod'

# Where evaluated alerts are delivered.
alerting:
  alertmanagers:
    - static_configs:
        - targets:
            - localhost:9093

# Recording/alerting rule files, evaluated every evaluation_interval.
rule_files:
  - 'rules/*.yml'

scrape_configs:
  # Prometheus scrapes itself for self-monitoring.
  - job_name: 'prometheus'
    static_configs:
      - targets: ['localhost:9090']

  # Application metrics; scraped more often than the global default.
  - job_name: 'api-service'
    static_configs:
      - targets: ['localhost:8080']
    metrics_path: '/metrics'
    scrape_interval: 10s

  # NOTE(review): 5432/6379 are the raw Postgres/Redis protocol ports, which
  # do not speak the Prometheus exposition format — these jobs should point
  # at postgres_exporter / redis_exporter endpoints instead. TODO confirm.
  - job_name: 'database'
    static_configs:
      - targets: ['localhost:5432']
    scrape_interval: 30s

  - job_name: 'redis'
    static_configs:
      - targets: ['localhost:6379']
    scrape_interval: 10s
Instrument your Node.js application:
// src/monitoring/metrics.ts
import { register, Counter, Histogram, Gauge } from 'prom-client';
import express from 'express';
// Request metrics
// ---------------
// The http_* series feed the dashboard panels and the alert rules defined in
// prometheus/rules/*.yml. Buckets run 1ms..5s so both very fast and very slow
// requests land in a meaningful bucket; labels are limited to
// method/route/status to keep cardinality bounded.
export const httpRequestDuration = new Histogram({
  name: 'http_request_duration_seconds',
  help: 'Duration of HTTP requests in seconds',
  labelNames: ['method', 'route', 'status'],
  buckets: [0.001, 0.01, 0.1, 0.5, 1, 2, 5],
});

export const httpRequestsTotal = new Counter({
  name: 'http_requests_total',
  help: 'Total HTTP requests',
  labelNames: ['method', 'route', 'status'],
});

// Business metrics
export const ordersTotal = new Counter({
  name: 'orders_total',
  help: 'Total orders processed',
  labelNames: ['status', 'currency'],
});

// Order value distribution; buckets chosen for a $10..$5000 range.
export const orderValue = new Histogram({
  name: 'order_value_usd',
  help: 'Order value in USD',
  labelNames: ['product_category'],
  buckets: [10, 50, 100, 500, 1000, 5000],
});

// Resource metrics
export const dbConnectionPoolSize = new Gauge({
  name: 'db_connection_pool_size',
  help: 'Number of database connections in pool',
  labelNames: ['pool_name'],
});

// NOTE(review): the help text says "percentage", but the CacheMissRate alert
// in rules/alerts.yml computes (1 - cache_hit_rate), i.e. treats it as a
// 0..1 ratio — confirm which unit callers actually set.
export const cacheHitRate = new Gauge({
  name: 'cache_hit_rate',
  help: 'Cache hit rate percentage',
  labelNames: ['cache_name'],
});
// Middleware to record request duration and count per method/route/status.
//
// BUG FIX: Express only populates req.route *after* the router has matched a
// handler — not at middleware time. The original captured the route before
// routing ran, so every request fell back to the raw req.path (unbounded
// label cardinality for parameterised URLs like /orders/123). Resolving the
// route inside the 'finish' listener, which fires after routing, fixes that.
export function metricsMiddleware(
  req: express.Request,
  res: express.Response,
  next: express.NextFunction
) {
  const start = Date.now();
  res.on('finish', () => {
    // Prefer the matched route pattern (e.g. '/orders/:id') over the
    // concrete path to keep metric cardinality bounded.
    const route = req.route?.path || req.path;
    const duration = (Date.now() - start) / 1000;
    const status = res.statusCode.toString();
    httpRequestDuration.labels(req.method, route, status).observe(duration);
    httpRequestsTotal.labels(req.method, route, status).inc();
  });
  next();
}
// Register GET /metrics, serving the registry in Prometheus exposition format.
export function setupMetricsEndpoint(app: express.Application) {
  app.get('/metrics', async (_req, res) => {
    const payload = await register.metrics();
    res.set('Content-Type', register.contentType);
    res.end(payload);
  });
}
// Usage in application: record business metrics around order processing.
// NOTE(review): illustrative only — the actual order work is elided here, so
// the catch branch cannot fire as written.
export async function processOrder(orderId: string, amount: number) {
  const currency = 'USD';
  try {
    ordersTotal.labels('completed', currency).inc();
    orderValue.labels('electronics').observe(amount);
  } catch (error) {
    ordersTotal.labels('failed', currency).inc();
    throw error;
  }
}
Define alert rules:
# prometheus/rules/alerts.yml
groups:
  - name: api-alerts
    interval: 30s
    rules:
      # Fires when >5% of requests over the last 5m returned 5xx.
      - alert: HighErrorRate
        expr: |
          (sum(rate(http_requests_total{status=~"5.."}[5m])) /
          sum(rate(http_requests_total[5m]))) > 0.05
        for: 5m
        labels:
          severity: critical
        annotations:
          summary: 'High error rate detected'
          description: 'Error rate is {{ $value | humanizePercentage }}'

      - alert: HighLatency
        expr: histogram_quantile(0.99, rate(http_request_duration_seconds_bucket[5m])) > 0.5
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: 'High latency detected'
          description: 'p99 latency is {{ $value }}s'

      # NOTE(review): db_connection_pool_used is not exported by the
      # instrumentation shown in this post (only db_connection_pool_size is) —
      # confirm the metric name against the actual exporter.
      - alert: DatabaseConnectionPoolExhausted
        expr: db_connection_pool_used / db_connection_pool_size > 0.9
        for: 2m
        labels:
          severity: critical
        annotations:
          summary: 'Database connection pool near capacity'

      # NOTE(review): this expression assumes cache_hit_rate is a 0..1 ratio,
      # but the gauge's help text calls it a percentage — align the two.
      - alert: CacheMissRate
        expr: (1 - cache_hit_rate) > 0.3
        for: 10m
        labels:
          severity: warning
        annotations:
          summary: 'High cache miss rate ({{ $value | humanizePercentage }})'
Structured Logging with Pino
Use JSON logging for machine-readable logs:
// src/logging/logger.ts
import pino from 'pino';
// In development we route through pino-pretty for human-readable output. In
// production we must NOT attach the transport at all, so pino emits raw
// newline-delimited JSON that Loki/promtail can parse.
//
// BUG FIX: the original attached the pino-pretty transport unconditionally
// (only toggling colorize), so production logs were pretty-printed text —
// not JSON — despite the comment claiming otherwise.
const isProduction = process.env.NODE_ENV === 'production';

export const logger = pino({
  level: process.env.LOG_LEVEL || 'info',
  formatters: {
    // Emit "level":"INFO" instead of pino's default numeric levels.
    level: (label) => {
      return { level: label.toUpperCase() };
    },
    bindings: (bindings) => {
      return {
        pid: bindings.pid,
        hostname: bindings.hostname,
      };
    },
  },
  // ISO-8601 timestamps are directly parseable by Loki's JSON pipeline.
  timestamp: pino.stdTimeFunctions.isoTime,
  // Dev-only pretty transport; production gets plain JSON on stdout.
  ...(isProduction
    ? {}
    : {
        transport: {
          target: 'pino-pretty',
          options: {
            colorize: true,
            ignore: 'pid,hostname',
          },
        },
      }),
});
// Build a child logger bound to per-request context: every line logged
// through it automatically carries the trace ID and request metadata.
// NOTE(review): req.id and req.user are not standard Express fields —
// presumably added by request-id / auth middleware elsewhere; verify.
export function createRequestLogger(
  req: express.Request,
  traceId: string
) {
  const requestContext = {
    traceId,
    requestId: req.id,
    userId: req.user?.id,
    method: req.method,
    path: req.path,
    userAgent: req.get('user-agent'),
  };
  return logger.child(requestContext);
}
// Example log output (JSON in prod):
// {
// "level": "INFO",
// "time": "2026-03-15T10:30:00Z",
// "traceId": "a1b2c3d4",
// "requestId": "req-123",
// "userId": "user-456",
// "method": "POST",
// "path": "/api/orders",
// "msg": "Order created successfully",
// "orderId": "order-789",
// "amount": 99.99,
// "processingTimeMs": 245
// }
Log business events and errors:
// HTTP handler: create an order, logging the business event on success and a
// structured error on failure.
//
// BUG FIX: pino's signature is logger.info(mergingObject, message) — object
// FIRST, message SECOND. The original passed (message, object), which makes
// pino treat the object as a printf-style interpolation argument, silently
// dropping the structured fields from the emitted JSON line.
export async function handleOrderRequest(
  req: express.Request,
  res: express.Response
) {
  const log = createRequestLogger(req, req.traceId);
  try {
    log.info(
      {
        itemCount: req.body.items.length,
        totalAmount: req.body.total,
      },
      'Processing order request'
    );
    const order = await createOrder(req.body);
    log.info(
      {
        orderId: order.id,
        status: order.status,
        amount: order.total,
        processingTimeMs: Date.now() - req.startTime,
      },
      'Order created successfully'
    );
    res.json(order);
  } catch (error) {
    // catch variables are `unknown` under strict TS; narrow before reading
    // Error-specific fields.
    const err = error instanceof Error ? error : new Error(String(error));
    log.error(
      {
        error: err.message,
        stack: err.stack,
        errorCode: (err as NodeJS.ErrnoException).code,
        processingTimeMs: Date.now() - req.startTime,
      },
      'Order creation failed'
    );
    res.status(500).json({ error: 'Order creation failed' });
  }
}
Loki for Log Aggregation
Set up Loki to store and query logs:
# loki/loki-config.yml
auth_enabled: false

server:
  http_listen_port: 3100
  log_level: info

ingester:
  # Flush chunks after 2h regardless of traffic; idle streams flush sooner.
  max_chunk_age: 2h
  chunk_idle_period: 3m

limits_config:
  enforce_metric_name: false
  reject_old_samples: true
  reject_old_samples_max_age: 168h  # 7 days

schema_config:
  configs:
    - from: 2020-10-24
      store: boltdb-shipper
      object_store: filesystem
      schema: v11
      index:
        prefix: index_
        period: 24h

storage_config:
  boltdb_shipper:
    active_index_directory: /loki/index
    shared_store: filesystem
  filesystem:
    directory: /loki/chunks

chunk_store_config:
  max_look_back_period: 0s  # 0 = unlimited lookback

table_manager:
  retention_deletes_enabled: false  # retention disabled: keep all logs
  retention_period: 0s
Query logs in Loki with LogQL:
// observability/log-queries.ts
//
// Canned LogQL queries for Grafana Explore / dashboard panels.
//
// BUG FIX: the original `serviceErrors` and `orderVolume` used CloudWatch
// Logs Insights syntax (`stats count() by ...`, `bin(1h)`), which Loki does
// not understand. LogQL aggregations are metric queries built from
// count_over_time wrapped in sum/sum by.
export const LOG_QUERIES = {
  // Find all errors in production
  productionErrors: `{environment="production"} | json | level="ERROR"`,
  // Find requests for a specific user.
  // NOTE(review): userId is interpolated directly into the query string —
  // only pass trusted/validated IDs (or escape quotes) to avoid LogQL
  // injection.
  userRequests: (userId: string) =>
    `{environment="production"} | json | userId="${userId}"`,
  // Find slow requests
  slowRequests: `{environment="production"} | json | processingTimeMs > 1000`,
  // Error count per errorCode for one service (metric query, 5m windows)
  serviceErrors: (service: string) =>
    `sum by (errorCode) (count_over_time({service="${service}"} | json | level="ERROR" [5m]))`,
  // Database query performance
  dbSlowQueries: `{environment="production"} | json | query_type="database" | duration_ms > 500`,
  // Orders per hour (metric query, 1h windows)
  orderVolume: `sum(count_over_time({environment="production"} | json | event="order_created" [1h]))`,
};
// Log sampling for high-volume events: emit roughly `samplingRate` of calls.
//
// BUG FIX: pino expects the merge object first and the message second —
// logger.info(obj, msg). The original (message, context) order made pino
// treat `context` as a printf interpolation argument, dropping the
// structured fields from the JSON output.
export const logWithSampling = (
  logger: pino.Logger,
  message: string,
  context: any,
  samplingRate: number = 0.1 // Log 10% of events
) => {
  if (Math.random() < samplingRate) {
    logger.info(context, message);
  }
};
OpenTelemetry Traces with Tempo
Install and configure OpenTelemetry:
// src/tracing/instrumentation.ts
import {
NodeSDK,
node,
} from '@opentelemetry/auto-instrumentations-node';
import { getNodeAutoInstrumentations } from '@opentelemetry/auto-instrumentations-node';
import { OTLPTraceExporter } from '@opentelemetry/exporter-trace-otlp-http';
import { PeriodicExportingMetricReader } from '@opentelemetry/sdk-metrics';
import { OTLPMetricExporter } from '@opentelemetry/exporter-metrics-otlp-http';
import { Resource } from '@opentelemetry/resources';
import { SemanticResourceAttributes } from '@opentelemetry/semantic-conventions';
const sdk = new NodeSDK({
resource: new Resource({
[SemanticResourceAttributes.SERVICE_NAME]: 'api-service',
[SemanticResourceAttributes.SERVICE_VERSION]: '1.0.0',
environment: process.env.NODE_ENV || 'development',
}),
traceExporter: new OTLPTraceExporter({
url: 'http://tempo:4318/v1/traces',
}),
metricReader: new PeriodicExportingMetricReader({
exporter: new OTLPMetricExporter({
url: 'http://tempo:4318/v1/metrics',
}),
}),
instrumentations: [getNodeAutoInstrumentations()],
});
sdk.start();
Create custom spans for business operations:
// src/tracing/spans.ts
import { trace, context, SpanStatusCode, Span } from '@opentelemetry/api';

const tracer = trace.getTracer('api-service');

/**
 * Run `fn` inside a new span parented on the currently-active context.
 * The span is ended exactly once; on failure the exception is recorded, the
 * span is marked ERROR, and the error is rethrown. This replaces four
 * identical copies of start/record/end boilerplate in the original.
 */
async function runInSpan<T>(
  name: string,
  attributes: Record<string, any>,
  fn: (span: Span) => Promise<T>
): Promise<T> {
  const span = tracer.startSpan(name, { attributes });
  try {
    const result = await fn(span);
    span.end();
    return result;
  } catch (error) {
    span.recordException(error as Error);
    span.setStatus({ code: SpanStatusCode.ERROR });
    span.end();
    throw error;
  }
}

// Trace order creation as a parent span with validate / payment / db child
// spans. Same span names, attributes, and error semantics as before.
export async function createOrderWithTracing(orderData: any) {
  return runInSpan(
    'create_order',
    {
      'order.item_count': orderData.items.length,
      'order.total_amount': orderData.total,
      'order.customer_id': orderData.customerId,
    },
    (parentSpan) =>
      // Bind the parent span to the active context so the child spans below
      // are parented correctly.
      context.with(trace.setSpan(context.active(), parentSpan), async () => {
        // Validate input
        await runInSpan('validate_order', {}, () => validateOrder(orderData));

        // Process payment
        await runInSpan(
          'process_payment',
          {
            'payment.method': orderData.paymentMethod,
            'payment.amount': orderData.total,
          },
          async (span) => {
            const payment = await processPayment(orderData);
            span.setAttributes({
              'payment.transaction_id': payment.transactionId,
              'payment.processing_time_ms': payment.processingTime,
            });
          }
        );

        // Save to database
        return runInSpan(
          'save_order_to_db',
          {
            'db.operation': 'insert',
            'db.table': 'orders',
          },
          async (span) => {
            const order = await saveOrder(orderData);
            span.setAttributes({ 'db.rows_affected': 1 });
            return order;
          }
        );
      })
  );
}
Correlating Traces, Logs, and Metrics
Use trace IDs to correlate all signals:
// src/middleware/trace-correlation.ts
import { trace, context } from '@opentelemetry/api';
import express from 'express';
import { v4 as uuidv4 } from 'uuid';
// Attach a trace ID to the request/response so logs, metrics, and Tempo
// traces can be cross-referenced.
//
// BUG FIXES vs the original:
//  1. When OTel auto-instrumentation already opened a span for this request,
//     its REAL trace ID is now preferred — the original preferred an inbound
//     header / freshly-generated UUID even when a live span existed, so
//     logged trace IDs did not match the traces stored in Tempo.
//  2. The fallback span we create ourselves is now ended on response finish;
//     the original never ended it, leaking one unended span per request.
export function traceCorrelationMiddleware(
  req: express.Request,
  res: express.Response,
  next: express.NextFunction
) {
  const activeSpan = trace.getActiveSpan();
  const span = activeSpan ?? trace.getTracer('express').startSpan('http');
  const spanContext = span.spanContext();

  // An all-zero trace ID means the span context is invalid/not sampled.
  const liveTraceId =
    spanContext.traceId && spanContext.traceId !== '0'.repeat(32)
      ? spanContext.traceId
      : undefined;
  const traceId =
    liveTraceId ||
    req.get('x-trace-id') ||
    req.get('traceparent')?.split('-')[1] ||
    uuidv4();

  // Attach to request for logging
  req.traceId = traceId;
  req.spanId = spanContext.spanId;

  // Echo back to the caller for client-side correlation
  res.set('x-trace-id', traceId);
  res.set('x-span-id', spanContext.spanId);

  // Only end the span we own; the auto-instrumented one is the SDK's.
  if (!activeSpan) {
    res.on('finish', () => span.end());
  }

  // Run the rest of the chain with the span bound to the active context so
  // downstream spans and logs pick it up.
  context.with(trace.setSpan(context.active(), span), () => {
    next();
  });
}
// Module augmentation so TypeScript knows about the fields the correlation
// middleware attaches to every request. `id` and `startTime` are also read by
// other handlers in this post (req.id, req.startTime), so they are declared
// here too — optional, since not every middleware stack sets them.
// NOTE(review): req.user is also read elsewhere but its shape is defined by
// the auth middleware (not shown); declare it where that middleware lives.
declare global {
  namespace Express {
    interface Request {
      traceId: string;
      spanId: string;
      id?: string; // request ID, if assigned by upstream middleware
      startTime?: number; // epoch ms when the request entered the app
    }
  }
}
// Derive a child logger that stamps every line with the request's trace and
// span IDs, so log lines can be joined against Tempo traces.
export function enhanceLogContext(logger: pino.Logger, req: express.Request) {
  const traceFields = {
    traceId: req.traceId,
    spanId: req.spanId,
  };
  return logger.child(traceFields);
}
Alert Rules from SLI Metrics
Define alerts based on SLIs (Service Level Indicators):
# prometheus/rules/sli-alerts.yml
groups:
  - name: sli-alerts
    interval: 1m
    rules:
      # SLI: 99% of requests complete in under 500ms
      - alert: SLILatencyViolation
        expr: |
          histogram_quantile(0.99, rate(http_request_duration_seconds_bucket[5m])) > 0.5
        for: 5m
        labels:
          severity: critical
          sli: 'latency'
        annotations:
          summary: 'SLI violation: Latency p99 > 500ms'

      # SLI: 99.5% of requests succeed (< 0.5% error rate)
      - alert: SLIAvailabilityViolation
        expr: |
          sum(rate(http_requests_total{status=~"5.."}[5m])) /
          sum(rate(http_requests_total[5m])) > 0.005
        for: 5m
        labels:
          severity: critical
          sli: 'availability'
        annotations:
          summary: 'SLI violation: Error rate > 0.5%'

      # SLI: 99.9% of database queries complete in under 100ms
      # NOTE(review): db_query_duration_seconds_bucket must come from DB
      # instrumentation not shown in this post — confirm the metric name.
      - alert: SLIDatabaseLatencyViolation
        expr: |
          histogram_quantile(
            0.999,
            rate(db_query_duration_seconds_bucket[5m])
          ) > 0.1
        for: 5m
        labels:
          severity: warning
          sli: 'database-latency'
        annotations:
          summary: 'Database query latency p999 > 100ms'
Grafana Dashboard Setup
Create a Grafana dashboard that ties everything together:
{
  "dashboard": {
    "title": "API Service Health",
    "panels": [
      {
        "id": 1,
        "title": "Request Rate",
        "targets": [
          { "expr": "sum(rate(http_requests_total[5m])) by (method)" }
        ],
        "type": "graph"
      },
      {
        "id": 2,
        "title": "Error Rate",
        "targets": [
          { "expr": "sum(rate(http_requests_total{status=~'5..'}[5m])) / sum(rate(http_requests_total[5m]))" }
        ]
      },
      {
        "id": 3,
        "title": "Latency p99",
        "targets": [
          { "expr": "histogram_quantile(0.99, rate(http_request_duration_seconds_bucket[5m]))" }
        ]
      },
      {
        "id": 4,
        "title": "Recent Errors",
        "targets": [
          { "expr": "{environment='production'} | json | level='ERROR'" }
        ],
        "type": "logs"
      },
      {
        "id": 5,
        "title": "Trace Analysis",
        "type": "traces",
        "description": "Click on a trace to view full request flow"
      }
    ]
  }
}
Checklist
- Install Prometheus and configure scrape targets
- Instrument application with prom-client
- Set up structured JSON logging with Pino
- Deploy Loki for log aggregation
- Install and configure OpenTelemetry SDK
- Create custom spans for business operations
- Add trace ID to logs and metrics
- Define SLI-based alerts
- Create Grafana dashboards
- Set up correlation between traces, logs, and metrics
Conclusion
Start with the three pillars: Prometheus for metrics (performance), Loki for logs (debugging), Tempo for traces (understanding flow). Correlate them with trace IDs. Define alerts from SLI metrics, not arbitrary thresholds. Build dashboards that answer the questions you ask at 2 AM. When you can drill from a Grafana alert to the relevant logs and traces, you have observability. When you understand your system's behavior from the outside, you're unstoppable.