Circuit Breaker / Bulkhead / Retry — Resilience Primitives
Circuit Breaker stops calling a failing dependency. Bulkhead isolates failure domains. Retry handles transient faults.
When to use
- Any synchronous service-to-service call on a high-traffic path
- Downstream dependency with variable latency or error rates
Tradeoffs
- Retry + circuit breaker can amplify load during recovery if not tuned (retry storm)
- Bulkhead adds configuration and thread/goroutine pool management complexity
- Go
- Python
// State is the operating mode of a CircuitBreaker.
type State int

const (
	// Closed is the normal mode: Call runs the wrapped function.
	Closed State = iota
	// Open means Call rejects immediately until the reset deadline passes.
	Open
	// HalfOpen is entered once the reset deadline has passed; the outcome of
	// the next call decides whether the breaker closes or opens again.
	HalfOpen
)

// ErrCircuitOpen is returned by Call when the breaker is open and the call
// was rejected without running the wrapped function. Compare with errors.Is.
var ErrCircuitOpen = errors.New("circuit open")

// CircuitBreaker stops invoking a failing operation once the failure count
// reaches a threshold, and lets a trial call through after a timeout.
//
// It contains no synchronization; guard it externally if it is shared
// between goroutines.
type CircuitBreaker struct {
	state     State         // current mode; the zero value is Closed
	failures  int           // failures counted since the last success
	threshold int           // failure count at which the breaker opens
	resetAt   time.Time     // while Open, calls are rejected before this instant
	timeout   time.Duration // how long the breaker stays Open after tripping
}

// NewCircuitBreaker returns a closed breaker that opens once the failure
// count reaches threshold and stays open for timeout before allowing a
// trial call.
func NewCircuitBreaker(threshold int, timeout time.Duration) *CircuitBreaker {
	return &CircuitBreaker{threshold: threshold, timeout: timeout}
}

// Call runs fn unless the breaker is open.
//
// While the breaker is Open and the reset deadline has not passed, fn is not
// run and ErrCircuitOpen is returned. Otherwise fn is run and its error is
// returned unchanged. A nil result closes the breaker and clears the failure
// count; a non-nil result increments the count and opens the breaker when the
// count has reached the threshold or when the call was a half-open trial.
func (cb *CircuitBreaker) Call(fn func() error) error {
	if cb.state == Open {
		if time.Now().Before(cb.resetAt) {
			return ErrCircuitOpen
		}
		// Deadline passed: let this call through as a trial.
		cb.state = HalfOpen
	}

	if err := fn(); err != nil {
		cb.failures++
		// A failed trial re-opens the breaker explicitly instead of relying
		// on the failure count still being at or above the threshold.
		if cb.state == HalfOpen || cb.failures >= cb.threshold {
			cb.state = Open
			cb.resetAt = time.Now().Add(cb.timeout)
		}
		return err
	}

	cb.state = Closed
	cb.failures = 0
	return nil
}
from enum import Enum, auto
from datetime import datetime, timedelta
class State(Enum):
    """Operating modes of a CircuitBreaker."""

    CLOSED = auto()  # calls are passed through to the wrapped function
    OPEN = auto()  # calls are rejected until the reset deadline passes
    HALF_OPEN = auto()  # deadline passed; the next call is a trial


class CircuitOpenError(RuntimeError):
    """Raised by CircuitBreaker.call when a call is rejected because the circuit is open.

    Subclasses RuntimeError, so existing ``except RuntimeError`` handlers
    still catch it, while new callers can tell a rejection apart from an
    error raised by the wrapped function.
    """


class CircuitBreaker:
    """Stops invoking a failing callable once its failure count reaches a threshold.

    After ``timeout_secs`` in the open state, one call is let through as a
    trial. The class performs no locking; guard it externally if it is
    shared across threads.
    """

    def __init__(self, threshold: int, timeout_secs: int):
        """Create a closed breaker.

        Args:
            threshold: failure count at which the breaker opens.
            timeout_secs: seconds the breaker stays open before a trial call.
        """
        self.state = State.CLOSED
        self.failures = 0
        self.threshold = threshold
        self.reset_at = datetime.min
        self.timeout = timedelta(seconds=timeout_secs)

    def call(self, fn):
        """Invoke ``fn()`` unless the breaker is open.

        Returns:
            Whatever ``fn()`` returns. A successful call closes the breaker
            and clears the failure count.

        Raises:
            CircuitOpenError: the breaker is open and the reset deadline has
                not passed; ``fn`` is not invoked.
            Exception: any exception raised by ``fn`` is re-raised after the
                failure has been recorded.
        """
        if self.state == State.OPEN:
            if datetime.now() < self.reset_at:
                raise CircuitOpenError("circuit open")
            # Deadline passed: let this call through as a trial.
            self.state = State.HALF_OPEN

        try:
            result = fn()
        except Exception:
            self.failures += 1
            # A failed trial re-opens the breaker explicitly instead of
            # relying on the failure count still being at or above the threshold.
            if self.state == State.HALF_OPEN or self.failures >= self.threshold:
                self.state = State.OPEN
                self.reset_at = datetime.now() + self.timeout
            raise

        self.state = State.CLOSED
        self.failures = 0
        return result
Gotcha: Always add jitter to retry delays. Synchronized retries from N clients after a failure = thundering herd.
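sleep(base * 2**attempt + random(0, base)) is the pattern: exponential backoff (2 raised to the attempt number, not XOR) plus a random jitter of up to one base interval.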