Circuit Breaker — Monitoring and Metrics
Core Patterns
State Change Event Monitoring
Log every state transition — CLOSED→OPEN, OPEN→HALF_OPEN, HALF_OPEN→CLOSED, and HALF_OPEN→OPEN (when a recovery probe fails) — with structured data. Silent state changes are the leading cause of missed production incidents.
import CircuitBreaker from 'opossum';
/**
 * Shape of the structured log entry written for one circuit breaker
 * state transition.
 */
interface StateChangeLog {
// Time the transition was logged, as an ISO-8601 string.
timestamp: string;
// Name of the breaker that changed state.
breaker: string;
// State the breaker left (e.g. 'CLOSED', 'OPEN', 'HALF_OPEN').
from: string;
// State the breaker entered.
to: string;
// Optional failure rate at the time of the transition.
// NOTE(review): units (ratio vs. percent) are not established by the visible code — confirm before populating.
failureRate?: number;
}
/**
 * Attaches structured JSON logging to a circuit breaker.
 *
 * Writes one `console.log` line per state transition and one per request
 * outcome (success, failure, reject, timeout, fallback).
 *
 * The `from` state of each transition is tracked instead of being assumed
 * from the event name: a breaker re-opens directly from HALF_OPEN when a
 * recovery probe fails, so hard-coding CLOSED→OPEN would misreport it.
 *
 * @param breaker - Breaker to observe. Listeners are added and never removed.
 */
function attachLogging(breaker: CircuitBreaker): void {
  // Seed from the breaker's current flags so that attaching to a breaker that
  // is already OPEN or HALF_OPEN still reports a correct first `from`.
  let current: string = breaker.opened ? 'OPEN' : breaker.halfOpen ? 'HALF_OPEN' : 'CLOSED';

  const logTransition = (to: string): void => {
    const entry: StateChangeLog = {
      timestamp: new Date().toISOString(),
      breaker: breaker.name,
      from: current,
      to,
    };
    current = to;
    // Structured log — ingest into Datadog, CloudWatch, or similar
    console.log(JSON.stringify(entry));
  };

  breaker.on('open', () => logTransition('OPEN'));
  breaker.on('halfOpen', () => logTransition('HALF_OPEN'));
  breaker.on('close', () => logTransition('CLOSED'));

  // Also log individual request outcomes
  breaker.on('success', (_result, latencyMs) => {
    console.log(JSON.stringify({ event: 'circuit.success', breaker: breaker.name, latencyMs }));
  });
  breaker.on('failure', (err) => {
    // A wrapped action may reject with a non-Error (or nothing at all); a
    // throw inside this listener would surface in the breaker's own code path.
    const error = err?.message ?? String(err);
    console.log(JSON.stringify({ event: 'circuit.failure', breaker: breaker.name, error }));
  });
  breaker.on('reject', () => {
    console.log(JSON.stringify({ event: 'circuit.rejected', breaker: breaker.name }));
  });
  breaker.on('timeout', () => {
    console.log(JSON.stringify({ event: 'circuit.timeout', breaker: breaker.name }));
  });
  breaker.on('fallback', (result) => {
    console.log(JSON.stringify({ event: 'circuit.fallback', breaker: breaker.name, result }));
  });
}
Manual breaker equivalent using the onStateChange hook from the hand-rolled implementation:
// Hand-rolled breaker for the payment service: every state change is logged
// as a warning, and entering OPEN additionally pages on-call.
const breaker = new CircuitBreaker({
  name: 'payment-service',
  failureThreshold: 5,
  windowMs: 60_000,
  recoveryTimeoutMs: 15_000,
  halfOpenProbes: 2,
  successThreshold: 2,
  onStateChange: (from, to) => {
    const context = { breaker: 'payment-service', from, to };
    logger.warn('Circuit state changed', context);

    // Only the transition into OPEN is page-worthy.
    if (to !== 'OPEN') {
      return;
    }
    alerting.page('payment-service circuit opened', { severity: 'critical' });
  },
});
Prometheus Metrics
Define a gauge, a counter, and a histogram to track circuit breaker state, request outcomes (from which failure rates are derived), and request latency.
import { Counter, Gauge, Histogram, register } from 'prom-client';
// Gauge holding the breaker's current state as a number, one series per
// breaker/service pair. The numeric encoding is spelled out in `help`.
const circuitState = new Gauge({
name: 'circuit_breaker_state',
help: 'Current state of circuit breaker (0=CLOSED, 1=HALF_OPEN, 2=OPEN)',
labelNames: ['breaker', 'service'],
registers: [register],
});
// Counter of breaker events, split by the `outcome` label.
// NOTE(review): outcomes may overlap for a single call (e.g. a failed call
// that then serves a fallback) — confirm before treating the sum across
// outcomes as a request total.
const circuitRequests = new Counter({
name: 'circuit_breaker_requests_total',
help: 'Total requests through the circuit breaker by outcome',
labelNames: ['breaker', 'service', 'outcome'],
// outcome: success | failure | rejected | timeout | fallback
registers: [register],
});
// Histogram of execution latency in milliseconds (see metric name and
// `help`); bucket upper bounds range from 50 ms to 5 s.
const circuitLatency = new Histogram({
name: 'circuit_breaker_request_duration_ms',
help: 'Latency of successful circuit breaker executions in ms',
labelNames: ['breaker', 'service'],
buckets: [50, 100, 250, 500, 1000, 2500, 5000],
registers: [register],
});
// Maps state names to the numeric values documented in the
// `circuit_breaker_state` help text.
// NOTE(review): the visible instrumentBreaker passes the numbers directly
// rather than reading this table — keep the two in sync.
const stateToValue: Record<string, number> = {
CLOSED: 0,
HALF_OPEN: 1,
OPEN: 2,
};
/**
 * Wires a circuit breaker's events to the Prometheus metrics defined in this
 * module.
 *
 * - `circuit_breaker_state` is seeded from the breaker's current state and
 *   updated on every open / halfOpen / close event
 *   (0=CLOSED, 1=HALF_OPEN, 2=OPEN).
 * - `circuit_breaker_requests_total` is incremented once per outcome event.
 * - `circuit_breaker_request_duration_ms` observes the latency reported with
 *   each success event.
 *
 * @param breaker - Breaker to observe. Listeners are added and never removed.
 * @param service - Value for the `service` label on every series.
 */
function instrumentBreaker(breaker: CircuitBreaker, service: string): void {
  const labels = { breaker: breaker.name, service };

  // Seed the gauge from the breaker's actual state: a breaker instrumented
  // after it has already tripped must not be reported as CLOSED until its
  // next transition.
  const initialState = breaker.opened ? 2 : breaker.halfOpen ? 1 : 0;
  circuitState.set(labels, initialState);

  breaker.on('open', () => circuitState.set(labels, 2));
  breaker.on('halfOpen', () => circuitState.set(labels, 1));
  breaker.on('close', () => circuitState.set(labels, 0));

  breaker.on('success', (_r, latency) => {
    circuitRequests.inc({ ...labels, outcome: 'success' });
    circuitLatency.observe(labels, latency);
  });
  breaker.on('failure', () => circuitRequests.inc({ ...labels, outcome: 'failure' }));
  breaker.on('reject', () => circuitRequests.inc({ ...labels, outcome: 'rejected' }));
  breaker.on('timeout', () => circuitRequests.inc({ ...labels, outcome: 'timeout' }));
  breaker.on('fallback', () => circuitRequests.inc({ ...labels, outcome: 'fallback' }));
}
// Expose /metrics endpoint (Prometheus scrape target)
import express from 'express';
const app = express();

// Serves the default prom-client registry in Prometheus exposition format.
// Serialisation is awaited inside try/catch: a rejection escaping an async
// handler is not guaranteed to produce a response (Express 4 leaves the
// request hanging), which would stall the scraper until its timeout.
app.get('/metrics', async (_req, res) => {
  try {
    const body = await register.metrics();
    res.set('Content-Type', register.contentType);
    res.end(body);
  } catch (err) {
    res.status(500).end(err instanceof Error ? err.message : String(err));
  }
});
Example Prometheus scrape output:
circuit_breaker_state{breaker="payment-service",service="payment"} 2
circuit_breaker_requests_total{breaker="payment-service",service="payment",outcome="success"} 1204
circuit_breaker_requests_total{breaker="payment-service",service="payment",outcome="failure"} 87
circuit_breaker_requests_total{breaker="payment-service",service="payment",outcome="rejected"} 312
circuit_breaker_request_duration_ms_bucket{...,le="250"} 1100
Grafana Dashboard JSON
A minimal Grafana dashboard panel configuration for circuit breaker visualization. Import into Grafana via Dashboard > Import > Paste JSON.
{
  "title": "Circuit Breakers",
  "uid": "circuit-breakers-v1",
  "panels": [
    {
      "id": 1,
      "title": "Circuit State (0=CLOSED, 1=HALF_OPEN, 2=OPEN)",
      "type": "stat",
      "gridPos": { "x": 0, "y": 0, "w": 8, "h": 8 },
      "targets": [
        {
          "expr": "circuit_breaker_state",
          "legendFormat": "{{breaker}}"
        }
      ],
      "options": {
        "colorMode": "background",
        "thresholds": {
          "steps": [
            { "value": 0, "color": "green" },
            { "value": 1, "color": "yellow" },
            { "value": 2, "color": "red" }
          ]
        }
      }
    },
    {
      "id": 2,
      "title": "Request Outcomes (per minute)",
      "type": "timeseries",
      "gridPos": { "x": 8, "y": 0, "w": 16, "h": 8 },
      "targets": [
        {
          "expr": "rate(circuit_breaker_requests_total[1m]) * 60",
          "legendFormat": "{{breaker}} / {{outcome}}"
        }
      ]
    },
    {
      "id": 3,
      "title": "Failure Rate %",
      "type": "timeseries",
      "gridPos": { "x": 0, "y": 8, "w": 12, "h": 8 },
      "targets": [
        {
          "expr": "100 * sum by (breaker) (rate(circuit_breaker_requests_total{outcome='failure'}[5m])) / sum by (breaker) (rate(circuit_breaker_requests_total[5m]))",
          "legendFormat": "{{breaker}} failure %"
        }
      ],
      "options": {
        "thresholds": {
          "steps": [
            { "value": 0, "color": "green" },
            { "value": 30, "color": "yellow" },
            { "value": 50, "color": "red" }
          ]
        }
      }
    },
    {
      "id": 4,
      "title": "p99 Latency (ms)",
      "type": "timeseries",
      "gridPos": { "x": 12, "y": 8, "w": 12, "h": 8 },
      "targets": [
        {
          "expr": "histogram_quantile(0.99, rate(circuit_breaker_request_duration_ms_bucket[5m]))",
          "legendFormat": "{{breaker}} p99"
        }
      ]
    }
  ]
}
Health Check Endpoint Exposing Circuit Breaker State
Provide an HTTP endpoint that reports each breaker’s state. Use this for readiness probes and monitoring dashboards.
import express from 'express';
import CircuitBreaker from 'opossum';
/** Breakers reported by the health endpoints, keyed by breaker name. */
const breakerRegistry = new Map<string, CircuitBreaker>();

/**
 * Adds a breaker to the health-check registry under its own name.
 * Registering another breaker with the same name replaces the earlier entry.
 *
 * @param breaker - Breaker to expose through the health endpoints.
 */
export function registerBreaker(breaker: CircuitBreaker): void {
  const { name } = breaker;
  breakerRegistry.set(name, breaker);
}
/**
 * Builds a JSON-serialisable snapshot of one breaker for the health endpoints.
 *
 * The state is derived from the breaker's `opened` / `halfOpen` flags, with
 * CLOSED as the fallback when neither is set.
 *
 * @param breaker - Breaker to describe.
 * @returns The breaker's name, its state, and a subset of its stats.
 */
function getBreakerStatus(breaker: CircuitBreaker) {
  let state: 'CLOSED' | 'HALF_OPEN' | 'OPEN';
  if (breaker.opened) state = 'OPEN';
  else if (breaker.halfOpen) state = 'HALF_OPEN';
  else state = 'CLOSED';

  // opossum exposes the rejection count as `stats.rejects`; reading
  // `stats.rejected` yields undefined, which JSON serialisation then drops.
  // The response key stays `rejected` so existing consumers are unaffected.
  const { successes, failures, rejects, timeouts, fallbacks, latencyMean } = breaker.stats;

  return {
    name: breaker.name,
    state,
    stats: {
      successes,
      failures,
      rejected: rejects,
      timeouts,
      fallbacks,
      latencyMean,
    },
  };
}
const app = express();

// Aggregate health: answers 503 "degraded" as soon as any registered breaker
// is OPEN, otherwise 200 "healthy". The per-breaker snapshots are included
// in both cases.
app.get('/health', (_req, res) => {
  const breakers = [...breakerRegistry.values()].map((b) => getBreakerStatus(b));
  const degraded = breakers.some(({ state }) => state === 'OPEN');

  const httpStatus = degraded ? 503 : 200;
  const status = degraded ? 'degraded' : 'healthy';

  res.status(httpStatus).json({
    status,
    timestamp: new Date().toISOString(),
    breakers,
  });
});

// Deep health — returns 200 always but includes state for dashboards
app.get('/health/deep', (_req, res) => {
  const timestamp = new Date().toISOString();
  const breakers = [...breakerRegistry.values()].map((b) => getBreakerStatus(b));
  res.json({ timestamp, breakers });
});
Alerting Rules for Sustained OPEN State
Prometheus alerting rules. Add to prometheus/rules/circuit-breakers.yml.
groups:
  - name: circuit_breaker_alerts
    rules:
      # Early warning: gauge value 2 (OPEN) held for 30 seconds.
      - alert: CircuitBreakerOpen
        expr: circuit_breaker_state == 2
        for: 30s
        labels:
          severity: warning
        annotations:
          summary: "Circuit breaker {{ $labels.breaker }} is OPEN"
          description: >
            The {{ $labels.breaker }} circuit breaker has been OPEN for more than 30 seconds.
            Service {{ $labels.service }} is likely unavailable. Fallbacks are active.

      # Escalation: same condition held for 5 minutes.
      - alert: CircuitBreakerOpenSustained
        expr: circuit_breaker_state == 2
        for: 5m
        labels:
          severity: critical
        annotations:
          summary: "Circuit breaker {{ $labels.breaker }} sustained OPEN (5m)"
          description: >
            The {{ $labels.breaker }} circuit breaker has been OPEN for 5+ minutes.
            Investigate {{ $labels.service }} service health immediately.
            Fallback responses are being served to all users.

      # Both sides are aggregated to (breaker, service) before dividing.
      # Without the aggregation the division matches series on every label,
      # including `outcome`, so only failure/failure pairs up and the ratio
      # is always 1.
      - alert: CircuitBreakerHighFailureRate
        expr: >
          sum by (breaker, service) (rate(circuit_breaker_requests_total{outcome="failure"}[5m]))
          / sum by (breaker, service) (rate(circuit_breaker_requests_total[5m])) > 0.4
        for: 2m
        labels:
          severity: warning
        annotations:
          summary: "Circuit breaker {{ $labels.breaker }} failure rate > 40%"
          description: >
            {{ $labels.breaker }} is failing {{ $value | humanizePercentage }} of requests
            over the last 5 minutes. Circuit may open soon.

      # Rejections above 10 req/s for a minute.
      - alert: CircuitBreakerHighRejectionRate
        expr: >
          rate(circuit_breaker_requests_total{outcome="rejected"}[5m]) > 10
        for: 1m
        labels:
          severity: warning
        annotations:
          summary: "Circuit breaker {{ $labels.breaker }} rejecting > 10 req/s"
          description: >
            {{ $labels.breaker }} is in OPEN state and rejecting requests at
            {{ $value | humanize }} req/s. Fallback load may be significant.
Alertmanager routing suggestion — route severity: critical circuit breaker alerts to PagerDuty and severity: warning to Slack:
route:
  receiver: slack-default
  routes:
    # Critical circuit breaker alerts page on-call. Matching on the severity
    # label (rather than one alert name) means any circuit breaker alert
    # later marked critical is paged as well.
    - match:
        severity: critical
      match_re:
        alertname: CircuitBreaker.*
      receiver: pagerduty-oncall
    # Every other circuit breaker alert goes to the dedicated Slack channel.
    - match_re:
        alertname: CircuitBreaker.*
      receiver: slack-circuit-breakers