Observability — Metrics, Logs, Traces
Metrics answer "what is wrong." Logs answer "why." Traces answer "where."
When to use
- Metrics: dashboards, alerting, capacity planning — always
- Logs: debugging specific request failures (structured JSON only)
- Traces: distributed systems where a single request spans multiple services
Tradeoffs
- High-cardinality metrics are expensive (one time series per label combination)
- Verbose unstructured logs have high storage cost and are hard to query
- Tracing adds per-request overhead and requires instrumentation across all services
Examples
- Go: `HandleRequest` below
- Python: `handle_request` below
// requestCount counts handled HTTP requests, partitioned by the "method"
// and "status" labels. Keep label values low-cardinality: every distinct
// label combination becomes its own time series.
var requestCount = promauto.NewCounterVec(prometheus.CounterOpts{
	Name: "http_requests_total",
	Help: "Total requests",
}, []string{"method", "status"})
// HandleRequest looks up the user named by the "id" query parameter and
// writes it to w as JSON. It emits all three signals along the way: a trace
// span around the whole request, a structured error log when the lookup
// fails, and a request counter labelled by method and status.
func HandleRequest(w http.ResponseWriter, r *http.Request) {
	ctx, span := tracer.Start(r.Context(), "handle_request")
	defer span.End()

	// Read the parameter once; URL.Query re-parses the raw query on each call.
	userID := r.URL.Query().Get("id")

	user, err := getUser(ctx, userID)
	if err != nil {
		slog.ErrorContext(ctx, "user fetch failed",
			"user_id", userID, "error", err)
		requestCount.WithLabelValues(r.Method, "500").Inc()
		http.Error(w, "error", http.StatusInternalServerError)
		return
	}

	requestCount.WithLabelValues(r.Method, "200").Inc()

	// Without an explicit Content-Type, net/http sniffs the body and labels
	// the JSON payload as text/plain.
	w.Header().Set("Content-Type", "application/json")
	if err := json.NewEncoder(w).Encode(user); err != nil {
		// The response may already be partially written, so the status can
		// no longer be changed reliably; record the failure instead of
		// dropping it.
		slog.ErrorContext(ctx, "response encode failed",
			"user_id", userID, "error", err)
	}
}
from prometheus_client import Counter
from opentelemetry import trace
import structlog
# Counter of handled requests, labelled by HTTP method and response status.
request_count = Counter(
    name="http_requests_total",
    documentation="Total requests",
    labelnames=["method", "status"],
)

# Tracer named after this module.
tracer = trace.get_tracer(__name__)

# Logger used for the structured events emitted below.
log = structlog.get_logger()
def handle_request(method: str, user_id: str):
    """Fetch a user inside a trace span, recording a metric and a log event.

    Args:
        method: HTTP method of the request; used as the ``method`` label.
        user_id: Identifier passed to ``get_user``.

    Returns:
        The value returned by ``get_user``, or ``None`` when that value is
        falsy (counted as status "500" and logged as ``user_fetch_failed``).
    """
    # Everything runs inside the span so the lookup and both outcomes are
    # attributed to "handle_request".
    with tracer.start_as_current_span("handle_request"):
        user = get_user(user_id)
        if not user:
            log.error("user_fetch_failed", user_id=user_id)
            request_count.labels(method=method, status="500").inc()
            return None
        request_count.labels(method=method, status="200").inc()
        return user
Gotcha: Logs without structure are grep-able but not queryable. Always emit JSON.
`log.Printf("user %s failed", id)` is a log. `{"level":"error","user_id":"id","error":"msg"}` is an observable event.