sre-expert
Expert-level site reliability engineering, SLOs, incident management, and operational excellence
What this skill does
# Site Reliability Engineering Expert
Expert guidance for SRE practices, reliability engineering, SLOs/SLIs, incident management, and operational excellence.
## Core Concepts
### SRE Fundamentals
- Service Level Objectives (SLOs)
- Service Level Indicators (SLIs)
- Error budgets
- Toil reduction
- Monitoring and alerting
- Capacity planning
### Reliability Practices
- Incident management
- Post-incident reviews (PIRs)
- On-call rotations
- Chaos engineering
- Disaster recovery
- Change management
### Automation
- Infrastructure as Code
- Configuration management
- Deployment automation
- Self-healing systems
- Runbook automation
- Automated remediation
## SLO/SLI Management
```python
from dataclasses import dataclass
from datetime import datetime, timedelta
from typing import List, Dict
import numpy as np
@dataclass
class SLI:
    """Service Level Indicator: a named metric and the query that yields it.

    Attributes:
        name: Short identifier of the indicator.
        description: Human-readable explanation of what is measured.
        query: Query expression that produces the indicator value.
        unit: Unit of the measured value.
    """

    name: str
    description: str
    query: str
    unit: str  # 'percentage', 'milliseconds', etc.
@dataclass
class SLO:
    """Service Level Objective: a target for one SLI over a rolling window.

    Attributes:
        name: Identifier of the objective.
        sli: The indicator this objective is defined on.
        target: Target value for the indicator, expressed in the SLI's unit.
        window_days: Length of the evaluation window, in days.
    """

    name: str
    sli: SLI
    target: float
    window_days: int
class SLOTracker:
    """Track SLO definitions and their SLI measurements in memory.

    NOTE: compliance and error-budget maths treat the indicator as
    "higher is better" on a 0-100 percentage scale (e.g. availability).
    A lower-is-better indicator such as latency is not evaluated correctly
    by the ``actual >= target`` comparison used here.
    """

    def __init__(self):
        # SLO definitions keyed by SLO name.
        self.slos: Dict[str, SLO] = {}
        # Per-SLO list of {'value': ..., 'timestamp': ...} samples.
        self.measurements: Dict[str, List[Dict]] = {}

    def define_slo(self, slo: SLO):
        """Register ``slo`` under its name and start an empty measurement list.

        Re-defining an existing name replaces the SLO and discards the
        measurements recorded for it so far.
        """
        self.slos[slo.name] = slo
        self.measurements[slo.name] = []

    def record_measurement(self, slo_name: str, value: float, timestamp: datetime):
        """Append one SLI sample for ``slo_name``.

        Samples for names that were never passed to ``define_slo`` are
        ignored.
        """
        if slo_name in self.slos:
            self.measurements[slo_name].append({
                'value': value,
                'timestamp': timestamp,
            })

    def calculate_slo_compliance(self, slo_name: str) -> Dict:
        """Compare the mean of in-window samples against the SLO target.

        Returns:
            ``{}`` when ``slo_name`` is unknown, ``{'status': 'no_data'}``
            when no sample falls inside the last ``window_days`` days,
            otherwise a dict with the target, the observed mean, the
            compliance flag, the window length and the sample count.
        """
        slo = self.slos.get(slo_name)
        if slo is None:
            return {}

        window_start = datetime.now() - timedelta(days=slo.window_days)
        values = [
            m['value']
            for m in self.measurements.get(slo_name, [])
            if m['timestamp'] > window_start
        ]
        if not values:
            return {'status': 'no_data'}

        # Convert to a built-in float so the result (and the comparison
        # below) uses plain Python types that serialise cleanly.
        actual = float(np.mean(values))
        return {
            'slo_name': slo_name,
            'target': slo.target,
            'actual': actual,
            'compliant': actual >= slo.target,
            'window_days': slo.window_days,
            'sample_count': len(values),
        }

    def calculate_error_budget(self, slo_name: str) -> Dict:
        """Compute how much of the error budget is left for ``slo_name``.

        The budget is ``100 - target`` and the spend is ``100 - actual``,
        i.e. both are interpreted as percentages.

        Returns:
            ``{}`` when ``slo_name`` is unknown (mirroring
            ``calculate_slo_compliance``), ``{'status': 'no_data'}`` when
            there are no in-window samples, otherwise the budget figures.
        """
        compliance = self.calculate_slo_compliance(slo_name)
        if not compliance:
            # Unknown SLO: there is no target to compute a budget from.
            return {}
        if compliance.get('status') == 'no_data':
            return {'status': 'no_data'}

        error_budget_target = 100 - compliance['target']
        errors_actual = 100 - compliance['actual']
        remaining = error_budget_target - errors_actual

        if error_budget_target > 0:
            remaining_pct = (remaining / error_budget_target) * 100
        else:
            # No budget to divide by: report it as fully available only
            # when nothing was overspent, so the percentage agrees with
            # the 'exhausted' flag.
            remaining_pct = 100 if remaining >= 0 else 0

        return {
            'slo_name': slo_name,
            'error_budget_target': error_budget_target,
            'errors_actual': errors_actual,
            'remaining': remaining,
            'remaining_percentage': remaining_pct,
            'exhausted': remaining < 0,
        }
# Example SLOs
def define_standard_slos() -> List[SLO]:
    """Build the example SLOs for a web service: availability and p95 latency.

    Both objectives use a 30-day window. The list is returned in the order
    availability, latency.
    """
    availability_sli = SLI(
        name="availability",
        description="Percentage of successful requests",
        query="sum(rate(http_requests_total{code!~'5..'}[5m])) / sum(rate(http_requests_total[5m])) * 100",
        unit="percentage",
    )
    latency_sli = SLI(
        name="latency_p95",
        description="95th percentile latency",
        query="histogram_quantile(0.95, rate(http_request_duration_seconds_bucket[5m]))",
        unit="seconds",
    )

    availability_slo = SLO(
        name="api_availability",
        sli=availability_sli,
        target=99.9,
        window_days=30,
    )
    latency_slo = SLO(
        name="api_latency",
        sli=latency_sli,
        target=0.5,  # 500ms
        window_days=30,
    )
    return [availability_slo, latency_slo]
```
## Incident Management
```python
from enum import Enum
from datetime import datetime
from typing import List, Optional
class Severity(Enum):
    """Incident severity levels, from most (SEV1) to least (SEV4) severe."""

    SEV1 = "sev1"  # Critical
    SEV2 = "sev2"  # High
    SEV3 = "sev3"  # Medium
    SEV4 = "sev4"  # Low
class IncidentStatus(Enum):
    """Lifecycle states an incident can be in."""

    INVESTIGATING = "investigating"
    IDENTIFIED = "identified"
    MONITORING = "monitoring"
    RESOLVED = "resolved"
@dataclass
class Incident:
    """Record of a single incident and the people and services involved.

    Attributes:
        incident_id: Unique identifier; used as the key in IncidentManager.
        title: Short summary of the incident.
        severity: Severity level of the incident.
        status: Current lifecycle status.
        started_at: Timestamp at which the incident started.
        detected_at: Timestamp at which the incident was detected.
        resolved_at: Timestamp of resolution, or None while unresolved.
        incident_commander: Name of the incident commander.
        responders: Names of the responders.
        affected_services: Names of the impacted services.
        timeline: Chronological list of event dicts.
        root_cause: Root-cause description, if one has been recorded.
    """

    incident_id: str
    title: str
    severity: Severity
    status: IncidentStatus
    started_at: datetime
    detected_at: datetime
    resolved_at: Optional[datetime]
    incident_commander: str
    responders: List[str]
    affected_services: List[str]
    timeline: List[Dict]
    root_cause: Optional[str] = None
class IncidentManager:
    """In-memory registry of incidents with status tracking and reporting."""

    def __init__(self):
        # Incidents keyed by incident_id.
        self.incidents: Dict[str, Incident] = {}

    def create_incident(self, incident: Incident) -> str:
        """Register ``incident``, notify on-call and open its timeline.

        An existing incident with the same ``incident_id`` is replaced.

        Returns:
            The incident's ``incident_id``.
        """
        self.incidents[incident.incident_id] = incident

        # Notify on-call
        self.notify_oncall(incident)

        # Start incident timeline
        self.add_timeline_event(
            incident.incident_id,
            "Incident created",
            datetime.now(),
        )
        return incident.incident_id

    def update_status(self, incident_id: str, new_status: IncidentStatus,
                      note: str):
        """Set a new status on an incident and log the change on its timeline.

        Unknown ``incident_id`` values are ignored. Moving to RESOLVED
        stamps ``resolved_at``; moving to any other status clears it so a
        re-opened incident is not reported as resolved.
        """
        incident = self.incidents.get(incident_id)
        if incident is None:
            return

        # One timestamp for both the timeline entry and resolved_at, so the
        # two records of the same transition cannot disagree.
        now = datetime.now()
        incident.status = new_status
        self.add_timeline_event(
            incident_id,
            f"Status changed to {new_status.value}: {note}",
            now,
        )
        if new_status == IncidentStatus.RESOLVED:
            incident.resolved_at = now
        else:
            incident.resolved_at = None

    def add_timeline_event(self, incident_id: str, event: str,
                           timestamp: datetime):
        """Append ``{'timestamp', 'event'}`` to the incident's timeline.

        Unknown ``incident_id`` values are ignored.
        """
        incident = self.incidents.get(incident_id)
        if incident is not None:
            incident.timeline.append({
                'timestamp': timestamp,
                'event': event,
            })

    def calculate_mttr(self, incident_id: str) -> Optional[float]:
        """Return this incident's time to resolution, in minutes.

        Despite the name, this is the duration of a single incident
        (``resolved_at - detected_at``), not a mean across incidents.

        Returns:
            Minutes from detection to resolution, or None when the incident
            is unknown or not resolved.
        """
        incident = self.incidents.get(incident_id)
        if incident is None or incident.resolved_at is None:
            return None
        duration = incident.resolved_at - incident.detected_at
        return duration.total_seconds() / 60  # minutes

    def generate_incident_report(self, incident_id: str) -> Dict:
        """Summarise an incident as a plain dict.

        Returns:
            ``{}`` when ``incident_id`` is unknown; otherwise the incident's
            fields with enum members replaced by their values and
            ``duration_minutes`` taken from ``calculate_mttr``.
        """
        incident = self.incidents.get(incident_id)
        if incident is None:
            return {}

        return {
            'incident_id': incident.incident_id,
            'title': incident.title,
            'severity': incident.severity.value,
            'status': incident.status.value,
            'duration_minutes': self.calculate_mttr(incident_id),
            'affected_services': incident.affected_services,
            'incident_commander': incident.incident_commander,
            'responders': incident.responders,
            'timeline': incident.timeline,
            'root_cause': incident.root_cause,
        }

    def notify_oncall(self, incident: Incident):
        """Notify on-call engineer (integrate with PagerDuty, etc.)"""
        # Implementation would integrate with alerting system
        pass
```
## Monitoring and Alerting
```python
from prometheus_client import Counter, Histogram, Gauge
import time
```

Related in devops
github-actions-advanced
Included — Design, debug, and harden GitHub Actions CI/CD workflows, including reusable workflows, matrix builds, self-hosted runners, OIDC authentication, caching, environments, secrets, and release automation.
cicd-pipeline-skill
Included — Generates CI/CD pipeline configurations for test automation with GitHub Actions, Jenkins, GitLab CI, and Azure DevOps. Includes TestMu AI cloud integration. Use when user mentions "CI/CD", "pipeline", "GitHub Actions", "Jenkins", "GitLab CI". Triggers on: "CI/CD", "pipeline", "GitHub Actions", "Jenkins", "GitLab CI", "Azure DevOps", "automated testing pipeline".
docker-expert
Included — Docker containerization expert with deep knowledge of multi-stage builds, image optimization, container security, Docker Compose orchestration, and production deployment patterns. Use PROACTIVELY for Dockerfile optimization, container issues, image size problems, security hardening, networking, and orchestration challenges.
terraform-expert
Included — Expert-level Terraform infrastructure as code, modules, state management, and production best practices
cicd-expert
Included — Expert-level CI/CD with GitHub Actions, Jenkins, deployment pipelines, and automation
monitoring-expert
Included — Expert-level monitoring and observability with Prometheus, Grafana, logging, and alerting