{
  "schemaVersion": "1.0",
  "item": {
    "slug": "afrexai-sre-platform",
    "name": "SRE & Incident Management Platform",
    "source": "tencent",
    "type": "skill",
    "category": "其他",
    "sourceUrl": "https://clawhub.ai/1kalin/afrexai-sre-platform",
    "canonicalUrl": "https://clawhub.ai/1kalin/afrexai-sre-platform",
    "targetPlatform": "OpenClaw"
  },
  "install": {
    "downloadMode": "redirect",
    "downloadUrl": "/downloads/afrexai-sre-platform",
    "sourceDownloadUrl": "https://wry-manatee-359.convex.site/api/v1/download?slug=afrexai-sre-platform",
    "sourcePlatform": "tencent",
    "targetPlatform": "OpenClaw",
    "installMethod": "Manual import",
    "extraction": "Extract archive",
    "prerequisites": [
      "OpenClaw"
    ],
    "packageFormat": "ZIP package",
    "includedAssets": [
      "README.md",
      "SKILL.md"
    ],
    "primaryDoc": "SKILL.md",
    "quickSetup": [
      "Download the package from Yavira.",
      "Extract the archive and review SKILL.md first.",
      "Import or place the package into your OpenClaw setup."
    ],
    "agentAssist": {
      "summary": "Hand the extracted package to your coding agent with a concrete install brief instead of figuring it out manually.",
      "steps": [
        "Download the package from Yavira.",
        "Extract it into a folder your agent can access.",
        "Paste one of the prompts below and point your agent at the extracted folder."
      ],
      "prompts": [
        {
          "label": "New install",
          "body": "I downloaded a skill package from Yavira. Read SKILL.md from the extracted folder and install it by following the included instructions. Then review README.md for any prerequisites, environment setup, or post-install checks. Tell me what you changed and call out any manual steps you could not complete."
        },
        {
          "label": "Upgrade existing",
          "body": "I downloaded an updated skill package from Yavira. Read SKILL.md from the extracted folder, compare it with my current installation, and upgrade it while preserving any custom configuration unless the package docs explicitly say otherwise. Then review README.md for any prerequisites, environment setup, or post-install checks. Summarize what changed and any follow-up checks I should run."
        }
      ]
    },
    "sourceHealth": {
      "source": "tencent",
      "status": "healthy",
      "reason": "direct_download_ok",
      "recommendedAction": "download",
      "checkedAt": "2026-04-23T16:43:11.935Z",
      "expiresAt": "2026-04-30T16:43:11.935Z",
      "httpStatus": 200,
      "finalUrl": "https://wry-manatee-359.convex.site/api/v1/download?slug=4claw-imageboard",
      "contentType": "application/zip",
      "probeMethod": "head",
      "details": {
        "probeUrl": "https://wry-manatee-359.convex.site/api/v1/download?slug=4claw-imageboard",
        "contentDisposition": "attachment; filename=\"4claw-imageboard-1.0.1.zip\"",
        "redirectLocation": null,
        "bodySnippet": null
      },
      "scope": "source",
      "summary": "Source download looks usable.",
      "detail": "Yavira can redirect you to the upstream package for this source.",
      "primaryActionLabel": "Download for OpenClaw",
      "primaryActionHref": "/downloads/afrexai-sre-platform"
    },
    "validation": {
      "installChecklist": [
        "Use the Yavira download entry.",
        "Review SKILL.md after the package is downloaded.",
        "Confirm the extracted package contains the expected setup assets."
      ],
      "postInstallChecks": [
        "Confirm the extracted package includes the expected docs or setup files.",
        "Validate the skill or prompts are available in your target agent workspace.",
        "Capture any manual follow-up steps the agent could not complete."
      ]
    },
    "downloadPageUrl": "https://openagent3.xyz/downloads/afrexai-sre-platform",
    "agentPageUrl": "https://openagent3.xyz/skills/afrexai-sre-platform/agent",
    "manifestUrl": "https://openagent3.xyz/skills/afrexai-sre-platform/agent.json",
    "briefUrl": "https://openagent3.xyz/skills/afrexai-sre-platform/agent.md"
  },
  "agentAssist": {
    "summary": "Hand the extracted package to your coding agent with a concrete install brief instead of figuring it out manually.",
    "steps": [
      "Download the package from Yavira.",
      "Extract it into a folder your agent can access.",
      "Paste one of the prompts below and point your agent at the extracted folder."
    ],
    "prompts": [
      {
        "label": "New install",
        "body": "I downloaded a skill package from Yavira. Read SKILL.md from the extracted folder and install it by following the included instructions. Then review README.md for any prerequisites, environment setup, or post-install checks. Tell me what you changed and call out any manual steps you could not complete."
      },
      {
        "label": "Upgrade existing",
        "body": "I downloaded an updated skill package from Yavira. Read SKILL.md from the extracted folder, compare it with my current installation, and upgrade it while preserving any custom configuration unless the package docs explicitly say otherwise. Then review README.md for any prerequisites, environment setup, or post-install checks. Summarize what changed and any follow-up checks I should run."
      }
    ]
  },
  "documentation": {
    "source": "clawhub",
    "primaryDoc": "SKILL.md",
    "sections": [
      {
        "title": "SRE & Incident Management Platform",
        "body": "Complete Site Reliability Engineering system — from SLO definition through incident response, chaos engineering, and operational excellence. Zero dependencies."
      },
      {
        "title": "Phase 1: Reliability Assessment",
        "body": "Before building anything, assess where you are."
      },
      {
        "title": "Service Catalog Entry",
        "body": "service:\n  name: \"\"\n  tier: \"\"  # critical | important | standard | experimental\n  owner_team: \"\"\n  oncall_rotation: \"\"\n  dependencies:\n    upstream: []    # services we call\n    downstream: []  # services that call us\n  data_classification: \"\"  # public | internal | confidential | restricted\n  deployment_frequency: \"\"  # daily | weekly | biweekly | monthly\n  architecture: \"\"  # monolith | microservice | serverless | hybrid\n  language: \"\"\n  infra: \"\"  # k8s | ECS | Lambda | VM | bare-metal\n  traffic_pattern: \"\"  # steady | diurnal | spiky | seasonal\n  peak_rps: 0\n  storage_gb: 0\n  monthly_cost_usd: 0"
      },
      {
        "title": "Maturity Assessment (Score 1-5 per dimension)",
        "body": "Dimension1 (Ad-hoc)3 (Defined)5 (Optimized)ScoreSLOsNo SLOs definedSLOs exist, reviewed quarterlyData-driven SLOs, auto error budgetsMonitoringBasic health checksGolden signals + dashboardsFull observability, anomaly detectionIncident ResponseNo runbooks, hero cultureDocumented process, postmortemsAutomated detection, structured ICSAutomationManual deploymentsCI/CD pipeline, some automationSelf-healing, auto-scaling, GitOpsChaos EngineeringNo testingBasic failure injectionContinuous chaos in productionCapacity PlanningReactive scalingQuarterly forecastingPredictive auto-scalingToil Management>50% toilToil tracked, reduction plans<25% toil, systematic eliminationOn-Call HealthBurnout, 24/7 individualsRotation exists, escalation pathsBalanced load, <2 pages/shift\n\nScore interpretation:\n\n8-16: Firefighting mode — start with SLOs + incident process\n17-24: Foundation built — add chaos engineering + toil reduction\n25-32: Maturing — optimize error budgets + capacity planning\n33-40: Advanced — focus on predictive reliability + culture"
      },
      {
        "title": "SLI Selection by Service Type",
        "body": "Service TypePrimary SLISecondary SLIsAPI/BackendRequest success rateLatency p50/p95/p99, throughputFrontend/WebPage load (LCP)FID/INP, CLS, error rateData PipelineFreshnessCorrectness, completeness, throughputStorageDurabilityAvailability, latencyStreamingProcessing latencyThroughput, ordering, data loss rateBatch JobSuccess rateDuration, SLA complianceML ModelPrediction latencyAccuracy drift, feature freshness"
      },
      {
        "title": "SLI Specification Template",
        "body": "sli:\n  name: \"request_success_rate\"\n  description: \"Proportion of valid requests served successfully\"\n  type: \"availability\"  # availability | latency | quality | freshness\n  measurement:\n    good_events: \"HTTP responses with status < 500\"\n    total_events: \"All HTTP requests excluding health checks\"\n    source: \"load balancer access logs\"\n    aggregation: \"sum(good) / sum(total) over rolling 28-day window\"\n  exclusions:\n    - \"Health check endpoints (/healthz, /readyz)\"\n    - \"Synthetic monitoring traffic\"\n    - \"Requests from blocked IPs\"\n    - \"4xx responses (client errors)\""
      },
      {
        "title": "SLO Target Selection Guide",
        "body": "NinesUptime %Downtime/monthAppropriate for2 nines99%7h 18mInternal tools, dev environments2.599.5%3h 39mNon-critical services, backoffice3 nines99.9%43m 50sStandard production services3.599.95%21m 55sImportant customer-facing services4 nines99.99%4m 23sCritical services, payments, auth5 nines99.999%26sLife-safety, financial clearing\n\nRules for setting targets:\n\nStart lower than you think — you can always tighten\nSLO < SLA (always have buffer — typically 0.1-0.5% margin)\nInternal SLO < External SLO (catch problems before customers do)\nEach nine costs ~10x more to achieve\nIf you can't measure it, you can't SLO it"
      },
      {
        "title": "SLO Document Template",
        "body": "slo:\n  service: \"\"\n  sli: \"\"\n  target: 99.9  # percentage\n  window: \"28d\"  # rolling window\n  error_budget: 0.1  # 100% - target\n  error_budget_minutes: 40  # per 28-day window\n  \n  burn_rate_alerts:\n    - name: \"fast_burn\"\n      burn_rate: 14.4  # exhausts budget in 2 hours\n      short_window: \"5m\"\n      long_window: \"1h\"\n      severity: \"page\"\n    - name: \"medium_burn\"\n      burn_rate: 6.0   # exhausts budget in ~5 hours\n      short_window: \"30m\"\n      long_window: \"6h\"\n      severity: \"page\"\n    - name: \"slow_burn\"\n      burn_rate: 1.0   # exhausts budget in 28 days\n      short_window: \"6h\"\n      long_window: \"3d\"\n      severity: \"ticket\"\n  \n  review_cadence: \"monthly\"\n  owner: \"\"\n  stakeholders: []\n  \n  escalation_when_budget_exhausted:\n    - \"Halt non-critical deployments\"\n    - \"Redirect engineering to reliability work\"\n    - \"Escalate to VP Engineering if no improvement in 48h\""
      },
      {
        "title": "Error Budget Policy",
        "body": "error_budget_policy:\n  service: \"\"\n  \n  budget_states:\n    healthy:\n      condition: \"remaining_budget > 50%\"\n      actions:\n        - \"Normal development velocity\"\n        - \"Feature work prioritized\"\n        - \"Chaos experiments allowed\"\n    \n    warning:\n      condition: \"remaining_budget 25-50%\"\n      actions:\n        - \"Increase monitoring scrutiny\"\n        - \"Review recent changes for risk\"\n        - \"Limit risky deployments to business hours\"\n        - \"No chaos experiments\"\n    \n    critical:\n      condition: \"remaining_budget 0-25%\"\n      actions:\n        - \"Feature freeze — reliability work only\"\n        - \"All deployments require SRE approval\"\n        - \"Mandatory rollback plan for every change\"\n        - \"Daily error budget review\"\n    \n    exhausted:\n      condition: \"remaining_budget <= 0\"\n      actions:\n        - \"Complete deployment freeze\"\n        - \"All engineering redirected to reliability\"\n        - \"VP Engineering notified\"\n        - \"Postmortem required for budget exhaustion\"\n        - \"Freeze maintained until budget recovers to 10%\"\n  \n  exceptions:\n    - \"Security patches always allowed\"\n    - \"Regulatory compliance changes always allowed\"\n    - \"Data loss prevention always allowed\"\n  \n  reset: \"Rolling 28-day window (no manual resets)\""
      },
      {
        "title": "Burn Rate Calculation",
        "body": "Burn rate = (error rate observed) / (error rate allowed by SLO)\n\nExample:\n- SLO: 99.9% (error budget = 0.1%)\n- Current error rate: 0.5%\n- Burn rate = 0.5% / 0.1% = 5x\n\nAt 5x burn rate → budget exhausted in 28d / 5 = 5.6 days"
      },
      {
        "title": "Error Budget Dashboard",
        "body": "Track weekly:\n\nMetricCurrentTrendStatusBudget remaining (%)↑↓→🟢🟡🔴Budget consumed this weekBurn rate (1h / 6h / 24h)Incidents consuming budgetTop error contributorProjected exhaustion date"
      },
      {
        "title": "Four Golden Signals",
        "body": "SignalWhat to MeasureAlert WhenLatencyp50, p95, p99 response timep99 > 2x baseline for 5 minTrafficRequests/sec, concurrent users>30% drop (indicates upstream issue) OR >50% spikeErrors5xx rate, timeout rate, exception rateError rate > SLO burn rate thresholdSaturationCPU, memory, disk, connections, queue depth>80% sustained for 10 min"
      },
      {
        "title": "USE Method (Infrastructure)",
        "body": "For every resource, track:\n\nUtilization: % of capacity used (0-100%)\nSaturation: queue depth / wait time (0 = no waiting)\nErrors: error count / error rate"
      },
      {
        "title": "RED Method (Services)",
        "body": "For every service, track:\n\nRate: requests per second\nErrors: failed requests per second\nDuration: latency distribution"
      },
      {
        "title": "Alert Design Rules",
        "body": "Every alert must have a runbook link — no exceptions\nEvery alert must be actionable — if you can't act on it, delete it\nSymptoms over causes — alert on \"users can't check out\" not \"database CPU high\"\nMulti-window, multi-burn-rate — avoid single-threshold alerts\nPage only for customer impact — everything else is a ticket\nAlert fatigue = death — review alert volume monthly; target <5 pages/week per service"
      },
      {
        "title": "Alert Severity Guide",
        "body": "SeverityResponse TimeNotificationExamplesP0/Page<5 minPagerDuty + phoneSLO burn rate critical, data loss, security breachP1/Urgent<30 minSlack + PagerDutyDegraded service, elevated errors, capacity warningP2/TicketNext business dayTicket auto-createdSlow burn, non-critical component downP3/LogWeekly reviewDashboard onlyInformational, trend detection"
      },
      {
        "title": "Structured Log Standard",
        "body": "{\n  \"timestamp\": \"2026-02-17T11:24:00.000Z\",\n  \"level\": \"error\",\n  \"service\": \"payment-api\",\n  \"trace_id\": \"abc123\",\n  \"span_id\": \"def456\",\n  \"message\": \"Payment processing failed\",\n  \"error_type\": \"TimeoutException\",\n  \"error_message\": \"Gateway timeout after 30s\",\n  \"http_method\": \"POST\",\n  \"http_path\": \"/api/v1/payments\",\n  \"http_status\": 504,\n  \"duration_ms\": 30012,\n  \"customer_id\": \"cust_xxx\",\n  \"payment_id\": \"pay_yyy\",\n  \"amount_cents\": 4999,\n  \"retry_count\": 2,\n  \"environment\": \"production\",\n  \"host\": \"payment-api-7b4d9-xk2p1\",\n  \"region\": \"us-east-1\"\n}"
      },
      {
        "title": "Severity Classification Matrix",
        "body": "Impact: 1 UserImpact: <25% UsersImpact: >25% UsersImpact: All UsersCore function downSEV3SEV2SEV1SEV1Degraded performanceSEV4SEV3SEV2SEV1Non-core feature downSEV4SEV3SEV3SEV2Cosmetic/minorSEV4SEV4SEV3SEV3\n\nAuto-escalation triggers:\n\nAny data loss → SEV1 minimum\nSecurity breach with PII → SEV1\nRevenue-impacting → SEV1 or SEV2\nSLA breach imminent → auto-escalate one level"
      },
      {
        "title": "Incident Command System (ICS)",
        "body": "RoleResponsibilityAssignedIncident Commander (IC)Owns resolution, makes decisions, manages timelineCommunications LeadStatus updates, stakeholder comms, customer-facingOperations LeadHands-on-keyboard, executing fixesSubject Matter ExpertDeep knowledge of affected systemScribeDocumenting timeline, actions, decisions\n\nIC Rules:\n\nIC does NOT debug — IC coordinates\nIC makes final decisions when team disagrees\nIC can escalate severity at any time\nIC owns handoff if rotation changes\nIC calls end-of-incident"
      },
      {
        "title": "Incident Response Workflow",
        "body": "DETECT → TRIAGE → RESPOND → MITIGATE → RESOLVE → REVIEW\n\nStep 1: DETECT (0-5 min)\n├── Alert fires OR user report received\n├── On-call acknowledges within SLA\n└── Quick assessment: is this real? What severity?\n\nStep 2: TRIAGE (5-15 min)\n├── Classify severity using matrix above\n├── Assign IC and roles\n├── Open incident channel (#inc-YYYY-MM-DD-title)\n├── Post initial status update\n└── Start timeline document\n\nStep 3: RESPOND (15 min - ongoing)\n├── IC briefs team: \"Here's what we know, here's what we don't\"\n├── Operations Lead begins investigation\n├── Check: recent deployments? Config changes? Dependency issues?\n├── Parallel investigation tracks if needed\n└── 15-minute check-ins for SEV1, 30-min for SEV2\n\nStep 4: MITIGATE (ASAP)\n├── Priority: STOP THE BLEEDING\n├── Options (fastest first):\n│   ├── Rollback last deployment\n│   ├── Feature flag disable\n│   ├── Traffic shift / failover\n│   ├── Scale up / circuit breaker\n│   └── Manual data fix\n├── Mitigated ≠ Resolved — temporary fix is OK\n└── Update status: \"Impact mitigated, root cause investigation ongoing\"\n\nStep 5: RESOLVE\n├── Root cause identified and fixed\n├── Verification: SLIs back to normal for 30+ minutes\n├── All-clear communicated\n└── IC declares incident resolved\n\nStep 6: REVIEW (within 5 business days)\n├── Blameless postmortem written\n├── Action items assigned with owners and deadlines\n├── Postmortem review meeting\n└── Action items tracked to completion"
      },
      {
        "title": "Communication Templates",
        "body": "Initial notification (internal):\n\n🔴 INCIDENT: [Title]\nSeverity: SEV[X]\nImpact: [Who/what is affected]\nStatus: Investigating\nIC: [Name]\nChannel: #inc-[date]-[slug]\nNext update: [time]\n\nCustomer-facing status:\n\n[Service] - Investigating increased error rates\n\nWe are currently investigating reports of [symptom]. \nSome users may experience [user-visible impact].\nOur team is actively working on a resolution.\nWe will provide an update within [time].\n\nResolution notification:\n\n✅ RESOLVED: [Title]\nDuration: [X hours Y minutes]\nImpact: [Summary]\nRoot cause: [One sentence]\nPostmortem: [Link] (within 5 business days)"
      },
      {
        "title": "Blameless Postmortem Template",
        "body": "postmortem:\n  title: \"\"\n  date: \"\"\n  severity: \"\"  # SEV1-4\n  duration: \"\"  # total incident duration\n  authors: []\n  reviewers: []\n  status: \"draft\"  # draft | in-review | final\n  \n  summary: |\n    One paragraph: what happened, what was the impact, how was it resolved.\n  \n  impact:\n    users_affected: 0\n    duration_minutes: 0\n    revenue_impact_usd: 0\n    slo_budget_consumed_pct: 0\n    data_loss: false\n    customer_tickets: 0\n  \n  timeline:\n    - time: \"\"\n      event: \"\"\n      # Chronological, every significant event\n      # Include detection time, escalation, mitigation attempts\n  \n  root_cause: |\n    Technical explanation of WHY it happened.\n    Go deep — surface causes are not root causes.\n  \n  contributing_factors:\n    - \"\"  # What made it worse or delayed resolution?\n  \n  detection:\n    how_detected: \"\"  # alert | user report | manual check\n    time_to_detect_minutes: 0\n    could_have_detected_sooner: \"\"\n  \n  resolution:\n    how_resolved: \"\"\n    time_to_mitigate_minutes: 0\n    time_to_resolve_minutes: 0\n  \n  what_went_well:\n    - \"\"  # Explicitly call out what worked\n  \n  what_went_wrong:\n    - \"\"\n  \n  where_we_got_lucky:\n    - \"\"  # Things that could have made it worse\n  \n  action_items:\n    - id: \"AI-001\"\n      type: \"\"  # prevent | detect | mitigate | process\n      description: \"\"\n      owner: \"\"\n      priority: \"\"  # P0 | P1 | P2\n      deadline: \"\"\n      status: \"open\"  # open | in-progress | done\n      ticket: \"\""
      },
      {
        "title": "Root Cause Analysis Methods",
        "body": "Five Whys (simple incidents):\n\nWhy did users see errors? → API returned 500s\nWhy did API return 500s? → Database connection pool exhausted\nWhy was pool exhausted? → Long-running query held connections\nWhy was query long-running? → Missing index on new column\nWhy was index missing? → Migration didn't include index; no query performance review in CI\n\n→ Root cause: No automated query performance check in deployment pipeline\n→ Action: Add query plan analysis to CI for migration PRs\n\nFishbone / Ishikawa (complex incidents):\n\nCategories to investigate:\n├── People: Training? Fatigue? Communication?\n├── Process: Runbook? Escalation? Change management?\n├── Technology: Bug? Config? Capacity? Dependency?\n├── Environment: Network? Cloud provider? Third party?\n├── Monitoring: Detection gap? Alert fatigue? Dashboard gap?\n└── Testing: Test coverage? Load testing? Chaos testing?\n\nContributing Factor Categories:\n\nCategoryQuestionsTriggerWhat change or event started it?PropagationWhy did it spread? Why wasn't it contained?DetectionWhy wasn't it caught earlier?ResolutionWhat slowed the fix?ProcessWhat process gaps contributed?"
      },
      {
        "title": "Postmortem Review Meeting (60 min)",
        "body": "1. Timeline walk-through (15 min)\n   - Author presents chronology\n   - Attendees add context (\"I remember seeing X at this point\")\n\n2. Root cause deep-dive (15 min)  \n   - Do we agree on root cause?\n   - Are there additional contributing factors?\n\n3. Action item review (20 min)\n   - Are these the RIGHT actions?\n   - Are they prioritized correctly?\n   - Do owners agree on deadlines?\n\n4. Process improvements (10 min)\n   - Could we have detected this sooner?\n   - Could we have resolved this faster?\n   - What would have prevented this entirely?"
      },
      {
        "title": "Chaos Maturity Model",
        "body": "LevelNameActivities0NoneNo chaos testing1ExploratoryManual fault injection in staging2SystematicScheduled chaos experiments in staging3ProductionControlled chaos in production (Game Days)4ContinuousAutomated chaos in production with safety controls"
      },
      {
        "title": "Chaos Experiment Template",
        "body": "experiment:\n  name: \"\"\n  hypothesis: \"When [fault], the system will [expected behavior]\"\n  \n  steady_state:\n    metrics:\n      - name: \"\"\n        baseline: \"\"\n        acceptable_range: \"\"\n  \n  method:\n    fault_type: \"\"  # network | compute | storage | dependency | data\n    target: \"\"      # which service/component\n    blast_radius: \"\"  # single pod | single AZ | percentage of traffic\n    duration: \"\"\n    \n  safety:\n    abort_conditions:\n      - \"SLO burn rate exceeds 10x\"\n      - \"Customer-visible errors detected\"\n      - \"Alert fires that we didn't expect\"\n    rollback_plan: \"\"\n    required_approvals: []\n    \n  results:\n    outcome: \"\"  # confirmed | disproved | inconclusive\n    observations: []\n    action_items: []"
      },
      {
        "title": "Chaos Experiment Library",
        "body": "CategoryExperimentValidatesNetworkAdd 200ms latency to DB callsTimeout handling, circuit breakersNetworkDrop 5% of packets to downstreamRetry logic, error handlingNetworkDNS resolution failureCaching, fallback, error messagesComputeKill random pod every 10 minAuto-restart, load balancingComputeCPU stress to 95% on 1 nodeAuto-scaling, graceful degradationComputeFill disk to 95%Disk monitoring, log rotation, alertsStorageIncrease DB latency 5xConnection pool handling, timeoutsStorageSimulate cache failure (Redis down)Cache-aside pattern, DB fallbackDependencyBlock external API (payment provider)Circuit breaker, queuing, retryDependencyReturn 429s from auth serviceRate limit handling, backoffDataClock skew on subset of nodesTimestamp handling, orderingScale10x traffic spike over 5 minutesAuto-scaling speed, queue depth"
      },
      {
        "title": "Game Day Runbook",
        "body": "PRE-GAME (1 week before):\n□ Experiment designed and reviewed\n□ Steady-state metrics identified\n□ Abort conditions defined\n□ All participants briefed\n□ Runbacks tested in staging\n□ Stakeholders notified\n\nGAME DAY:\n□ Verify steady state (15 min baseline)\n□ Announce in #engineering: \"Chaos Game Day starting\"\n□ Inject fault\n□ Observe and document\n□ If abort condition hit → rollback immediately\n□ Run for planned duration\n□ Remove fault\n□ Verify recovery to steady state\n\nPOST-GAME (same day):\n□ Results documented\n□ Surprises noted\n□ Action items created\n□ Share findings in team meeting"
      },
      {
        "title": "Toil Identification",
        "body": "Definition: Work that is manual, repetitive, automatable, tactical, without enduring value, and scales linearly with service growth."
      },
      {
        "title": "Toil Inventory Template",
        "body": "toil_item:\n  name: \"\"\n  category: \"\"  # deployment | scaling | config | data | access | monitoring | recovery\n  frequency: \"\"  # daily | weekly | monthly | per-incident\n  time_per_occurrence_min: 0\n  occurrences_per_month: 0\n  total_hours_per_month: 0\n  teams_affected: []\n  automation_difficulty: \"\"  # low | medium | high\n  automation_value: 0  # hours saved per month\n  priority_score: 0  # value / difficulty"
      },
      {
        "title": "Toil Reduction Priority Matrix",
        "body": "Low EffortMedium EffortHigh EffortHigh Value (>10 hrs/mo)DO FIRSTDO SECONDPLANMed Value (2-10 hrs/mo)DO SECONDPLANEVALUATELow Value (<2 hrs/mo)QUICK WINSKIPSKIP"
      },
      {
        "title": "Common Toil Targets (Ranked by Impact)",
        "body": "Manual deployments → CI/CD pipeline + GitOps\nAccess provisioning → Self-service + auto-approval for low-risk\nCertificate renewals → Auto-renewal (cert-manager, Let's Encrypt)\nScaling decisions → HPA + predictive auto-scaling\nLog investigation → Structured logging + correlation + dashboards\nData fixes → Self-service admin tools + validation at ingestion\nConfig changes → Config-as-code + automated rollout\nIncident response → Automated runbooks for known issues\nCapacity reporting → Automated dashboards + forecasting\nOn-call triage → Noise reduction + auto-remediation for known patterns"
      },
      {
        "title": "Toil Budget Rule",
        "body": "Target: <25% of SRE time spent on toil. Track monthly. If above 25%, prioritize automation over all feature work."
      },
      {
        "title": "Capacity Model Template",
        "body": "capacity_model:\n  service: \"\"\n  bottleneck_resource: \"\"  # CPU | memory | storage | connections | bandwidth\n  \n  current_state:\n    peak_utilization_pct: 0\n    headroom_pct: 0\n    cost_per_month_usd: 0\n    \n  growth_forecast:\n    metric: \"\"  # MAU | requests/sec | storage_gb\n    current: 0\n    monthly_growth_pct: 0\n    projected_6mo: 0\n    projected_12mo: 0\n    \n  scaling_strategy:\n    type: \"\"  # horizontal | vertical | hybrid\n    auto_scaling: true\n    min_instances: 0\n    max_instances: 0\n    scale_up_threshold: 80  # % utilization\n    scale_down_threshold: 30\n    cooldown_seconds: 300\n    \n  cost_projection:\n    current_monthly: 0\n    projected_6mo_monthly: 0\n    projected_12mo_monthly: 0"
      },
      {
        "title": "Capacity Planning Cadence",
        "body": "FrequencyActionDailyReview auto-scaling events, check for anomaliesWeeklyReview utilization trends, spot-check headroomMonthlyUpdate growth model, review cost projectionsQuarterlyFull capacity review, budget planning, architecture checkPre-launchLoad test to 2x expected peak, verify scaling"
      },
      {
        "title": "Load Testing Benchmarks",
        "body": "ScenarioMethodDurationTargetBaselineSteady load at current peak30 minEstablish metricsGrowth2x current peak15 minVerify scaling worksSpike10x normal in 60 seconds5 minCircuit breakers holdSoak1.5x normal load4 hoursNo memory leaks, degradationStressRamp until failureUntil breakFind actual limits"
      },
      {
        "title": "On-Call Health Metrics",
        "body": "MetricHealthyWarningCriticalPages per shift<22-5>5Off-hours pages<1/week1-3/week>3/weekTime to acknowledge<5 min5-15 min>15 minTime to mitigate<30 min30-60 min>60 minFalse positive rate<10%10-30%>30%Escalation rate<20%20-40%>40%On-call satisfaction>4/53-4/5<3/5"
      },
      {
        "title": "On-Call Rotation Best Practices",
        "body": "Minimum rotation size: 5 people (one week on, four weeks off)\nNo back-to-back weeks unless team is too small (fix the team size)\nFollow-the-sun for global teams (no one pages at 3 AM if avoidable)\nPrimary + secondary on-call always\nHandoff document at rotation change — open issues, recent deploys, known risks\nCompensation — on-call pay, time off in lieu, or equivalent"
      },
      {
        "title": "On-Call Handoff Template",
        "body": "## On-Call Handoff: [Date]\n\n### Open Issues\n- [Issue]: [Status, next steps]\n\n### Recent Changes (last 7 days)\n- [Deployment/config change]: [Risk level, rollback plan]\n\n### Known Risks\n- [Event/condition]: [What to watch for]\n\n### Scheduled Maintenance\n- [When]: [What, duration, rollback plan]\n\n### Runbook Updates\n- [Any new/updated runbooks since last rotation]"
      },
      {
        "title": "Runbook Template",
        "body": "runbook:\n  title: \"\"\n  alert_name: \"\"  # exact alert that triggers this\n  last_updated: \"\"\n  owner: \"\"\n  \n  overview: |\n    What this alert means in plain English.\n    \n  impact: |\n    What users/systems are affected and how.\n    \n  diagnosis:\n    - step: \"Check service health\"\n      command: \"\"\n      expected: \"\"\n      if_unexpected: \"\"\n    - step: \"Check recent deployments\"\n      command: \"\"\n      expected: \"\"\n      if_unexpected: \"Rollback: [command]\"\n    - step: \"Check dependencies\"\n      command: \"\"\n      expected: \"\"\n      if_unexpected: \"\"\n      \n  mitigation:\n    - option: \"Rollback\"\n      when: \"Recent deployment suspected\"\n      steps: []\n    - option: \"Scale up\"\n      when: \"Traffic spike\"\n      steps: []\n    - option: \"Failover\"\n      when: \"Single component failure\"\n      steps: []\n      \n  escalation:\n    after_minutes: 30\n    contact: \"\"\n    context_to_provide: \"\""
      },
      {
        "title": "Weekly SRE Review (30 min)",
        "body": "1. SLO Status (5 min)\n   - Budget remaining per service\n   - Any burn rate alerts this week?\n\n2. Incident Review (10 min)\n   - Incidents this week: count, severity, duration\n   - Open postmortem action items: status check\n\n3. On-Call Health (5 min)\n   - Pages this week (total, off-hours, false positives)\n   - Any on-call feedback?\n\n4. Reliability Work (10 min)\n   - Automation shipped this week\n   - Toil reduced (hours saved)\n   - Chaos experiments run\n   - Capacity concerns"
      },
      {
        "title": "Monthly Reliability Report",
        "body": "monthly_report:\n  period: \"\"\n  \n  slo_summary:\n    services_meeting_slo: 0\n    services_breaching_slo: 0\n    worst_performing: \"\"\n    \n  incidents:\n    total: 0\n    by_severity: { SEV1: 0, SEV2: 0, SEV3: 0, SEV4: 0 }\n    mttr_minutes: 0\n    mttd_minutes: 0\n    repeat_incidents: 0\n    \n  error_budget:\n    services_in_healthy: 0\n    services_in_warning: 0\n    services_in_critical: 0\n    services_exhausted: 0\n    \n  toil:\n    hours_spent: 0\n    hours_automated_away: 0\n    pct_of_sre_time: 0\n    \n  on_call:\n    total_pages: 0\n    off_hours_pages: 0\n    false_positive_pct: 0\n    avg_ack_time_min: 0\n    \n  action_items:\n    open: 0\n    completed_this_month: 0\n    overdue: 0\n    \n  highlights: []\n  concerns: []\n  next_month_priorities: []"
      },
      {
        "title": "Production Readiness Review Checklist",
        "body": "Before any new service goes to production:\n\nCategoryCheckStatusSLOsSLIs defined and measuredSLOsSLO targets set with stakeholder agreementSLOsError budget policy documentedMonitoringGolden signals dashboardedMonitoringAlerting configured with runbooksMonitoringStructured logging implementedMonitoringDistributed tracing enabledIncidentsOn-call rotation establishedIncidentsEscalation paths documentedIncidentsRunbooks for top 5 failure modesCapacityLoad tested to 2x expected peakCapacityAuto-scaling configured and testedCapacityResource limits set (CPU, memory)ResilienceGraceful degradation implementedResilienceCircuit breakers for dependenciesResilienceRetry with exponential backoffResilienceTimeout configured for all external callsDeployRollback tested and documentedDeployCanary/blue-green deployment readyDeployFeature flags for risky featuresSecurityAuthentication and authorizationSecuritySecrets in vault (not env vars)SecurityDependencies scannedDataBackup and restore testedDataData retention policy definedDocsArchitecture diagram currentDocsAPI documentation publishedDocsOperational runbook complete"
      },
      {
        "title": "Self-Healing Automation",
        "body": "auto_remediation:\n  - trigger: \"pod_crash_loop\"\n    condition: \"restart_count > 3 in 10 min\"\n    action: \"Delete pod, let scheduler reschedule\"\n    escalate_if: \"Still crashing after 3 auto-remediations\"\n    \n  - trigger: \"disk_usage_high\"\n    condition: \"disk_usage > 85%\"\n    action: \"Run log cleanup script, archive old data\"\n    escalate_if: \"Still above 85% after cleanup\"\n    \n  - trigger: \"connection_pool_exhausted\"\n    condition: \"available_connections = 0\"\n    action: \"Kill idle connections, increase pool temporarily\"\n    escalate_if: \"Pool exhausted again within 1 hour\"\n    \n  - trigger: \"certificate_expiring\"\n    condition: \"days_until_expiry < 14\"\n    action: \"Trigger cert renewal\"\n    escalate_if: \"Renewal fails\""
      },
      {
        "title": "Multi-Region Reliability",
        "body": "StrategyComplexityRTOCostActive-passiveLowMinutes1.5xActive-active readMediumSeconds1.8xActive-active fullHighNear-zero2-3xCell-basedVery highPer-cell2-4x\n\nDecision guide:\n\nSLO < 99.9% → Single region with good backups\nSLO 99.9-99.95% → Active-passive with automated failover\nSLO > 99.95% → Active-active (read or full)\nSLO > 99.99% → Cell-based architecture"
      },
      {
        "title": "Reliability Culture Indicators",
        "body": "Healthy signals:\n\nPostmortems are blameless and well-attended\nError budgets are respected (feature freeze actually happens)\nOn-call is shared fairly and compensated\nToil is tracked and reducing quarter-over-quarter\nChaos experiments happen regularly\nTeams own their reliability (not just SRE)\n\nWarning signs:\n\n\"Hero culture\" — same person always saves the day\nPostmortems are blame-focused or skipped\nError budget exhaustion doesn't change behavior\nOn-call is dreaded, same 2 people always paged\n\"We'll fix reliability after this feature ships\" (always)\nSRE team is just an ops team with a new name"
      },
      {
        "title": "Quality Scoring Rubric (0-100)",
        "body": "DimensionWeight0-23-45SLO Coverage20%No SLOsSLOs for critical servicesAll services with SLOs, error budgets, reviewsMonitoring15%Basic health checksGolden signals + dashboardsFull observability stack + anomaly detectionIncident Response15%Ad-hoc, no processICS roles, runbooks, postmortemsStructured ICS, blameless culture, action trackingAutomation15%Manual everythingCI/CD + some automationSelf-healing, GitOps, <25% toilChaos Engineering10%NoneStaging experimentsContinuous production chaos with safetyCapacity Planning10%ReactiveQuarterly forecastingPredictive, auto-scaling, cost-optimizedOn-Call Health10%Burnout, hero cultureFair rotation, <5 pages/shiftBalanced, compensated, <2 pages/shiftDocumentation5%Nothing writtenRunbooks existComplete, current, tested runbooks"
      },
      {
        "title": "Natural Language Commands",
        "body": "\"Assess reliability for [service]\" → Run maturity assessment\n\"Define SLOs for [service]\" → Walk through SLI selection + SLO setting\n\"Check error budget for [service]\" → Calculate current budget status\n\"Start incident for [description]\" → Create incident channel, assign IC, begin workflow\n\"Write postmortem for [incident]\" → Generate structured postmortem\n\"Plan chaos experiment for [service]\" → Design experiment with hypothesis\n\"Audit toil for [team]\" → Inventory and prioritize toil\n\"Review on-call health\" → Analyze page volume, satisfaction, fairness\n\"Production readiness review for [service]\" → Run full checklist\n\"Monthly reliability report\" → Generate comprehensive report\n\"Design runbook for [alert]\" → Create structured runbook\n\"Plan capacity for [service] growing at [X%]\" → Build capacity model"
      }
    ],
    "body": "SRE & Incident Management Platform\n\nComplete Site Reliability Engineering system — from SLO definition through incident response, chaos engineering, and operational excellence. Zero dependencies.\n\nPhase 1: Reliability Assessment\n\nBefore building anything, assess where you are.\n\nService Catalog Entry\nservice:\n  name: \"\"\n  tier: \"\"  # critical | important | standard | experimental\n  owner_team: \"\"\n  oncall_rotation: \"\"\n  dependencies:\n    upstream: []    # services we call\n    downstream: []  # services that call us\n  data_classification: \"\"  # public | internal | confidential | restricted\n  deployment_frequency: \"\"  # daily | weekly | biweekly | monthly\n  architecture: \"\"  # monolith | microservice | serverless | hybrid\n  language: \"\"\n  infra: \"\"  # k8s | ECS | Lambda | VM | bare-metal\n  traffic_pattern: \"\"  # steady | diurnal | spiky | seasonal\n  peak_rps: 0\n  storage_gb: 0\n  monthly_cost_usd: 0\n\nMaturity Assessment (Score 1-5 per dimension)\nDimension\t1 (Ad-hoc)\t3 (Defined)\t5 (Optimized)\tScore\nSLOs\tNo SLOs defined\tSLOs exist, reviewed quarterly\tData-driven SLOs, auto error budgets\t\nMonitoring\tBasic health checks\tGolden signals + dashboards\tFull observability, anomaly detection\t\nIncident Response\tNo runbooks, hero culture\tDocumented process, postmortems\tAutomated detection, structured ICS\t\nAutomation\tManual deployments\tCI/CD pipeline, some automation\tSelf-healing, auto-scaling, GitOps\t\nChaos Engineering\tNo testing\tBasic failure injection\tContinuous chaos in production\t\nCapacity Planning\tReactive scaling\tQuarterly forecasting\tPredictive auto-scaling\t\nToil Management\t>50% toil\tToil tracked, reduction plans\t<25% toil, systematic elimination\t\nOn-Call Health\tBurnout, 24/7 individuals\tRotation exists, escalation paths\tBalanced load, <2 pages/shift\t\n\nScore interpretation:\n\n8-16: Firefighting mode — start with SLOs + incident process\n17-24: Foundation built — add chaos engineering + toil reduction\n25-32: Maturing — optimize error budgets + capacity planning\n33-40: Advanced — focus on predictive reliability + culture\nPhase 2: SLI/SLO Framework\nSLI Selection by Service Type\nService Type\tPrimary SLI\tSecondary SLIs\nAPI/Backend\tRequest success rate\tLatency p50/p95/p99, throughput\nFrontend/Web\tPage load (LCP)\tFID/INP, CLS, error rate\nData Pipeline\tFreshness\tCorrectness, completeness, throughput\nStorage\tDurability\tAvailability, latency\nStreaming\tProcessing latency\tThroughput, ordering, data loss rate\nBatch Job\tSuccess rate\tDuration, SLA compliance\nML Model\tPrediction latency\tAccuracy drift, feature freshness\nSLI Specification Template\nsli:\n  name: \"request_success_rate\"\n  description: \"Proportion of valid requests served successfully\"\n  type: \"availability\"  # availability | latency | quality | freshness\n  measurement:\n    good_events: \"HTTP responses with status < 500\"\n    total_events: \"All HTTP requests excluding health checks\"\n    source: \"load balancer access logs\"\n    aggregation: \"sum(good) / sum(total) over rolling 28-day window\"\n  exclusions:\n    - \"Health check endpoints (/healthz, /readyz)\"\n    - \"Synthetic monitoring traffic\"\n    - \"Requests from blocked IPs\"\n    - \"4xx responses (client errors)\"\n\nSLO Target Selection Guide\nNines\tUptime %\tDowntime/month\tAppropriate for\n2 nines\t99%\t7h 18m\tInternal tools, dev environments\n2.5\t99.5%\t3h 39m\tNon-critical services, backoffice\n3 nines\t99.9%\t43m 50s\tStandard production services\n3.5\t99.95%\t21m 55s\tImportant customer-facing services\n4 nines\t99.99%\t4m 23s\tCritical services, payments, auth\n5 nines\t99.999%\t26s\tLife-safety, financial clearing\n\nRules for setting targets:\n\nStart lower than you think — you can always tighten\nSLO < SLA (always have buffer — typically 0.1-0.5% margin)\nInternal SLO < External SLO (catch problems before customers do)\nEach nine costs ~10x more to achieve\nIf you can't measure it, you can't SLO it\nSLO Document Template\nslo:\n  service: \"\"\n  sli: \"\"\n  target: 99.9  # percentage\n  window: \"28d\"  # rolling window\n  error_budget: 0.1  # 100% - target\n  error_budget_minutes: 40  # per 28-day window\n  \n  burn_rate_alerts:\n    - name: \"fast_burn\"\n      burn_rate: 14.4  # exhausts budget in 2 hours\n      short_window: \"5m\"\n      long_window: \"1h\"\n      severity: \"page\"\n    - name: \"medium_burn\"\n      burn_rate: 6.0   # exhausts budget in ~5 hours\n      short_window: \"30m\"\n      long_window: \"6h\"\n      severity: \"page\"\n    - name: \"slow_burn\"\n      burn_rate: 1.0   # exhausts budget in 28 days\n      short_window: \"6h\"\n      long_window: \"3d\"\n      severity: \"ticket\"\n  \n  review_cadence: \"monthly\"\n  owner: \"\"\n  stakeholders: []\n  \n  escalation_when_budget_exhausted:\n    - \"Halt non-critical deployments\"\n    - \"Redirect engineering to reliability work\"\n    - \"Escalate to VP Engineering if no improvement in 48h\"\n\nPhase 3: Error Budget Management\nError Budget Policy\nerror_budget_policy:\n  service: \"\"\n  \n  budget_states:\n    healthy:\n      condition: \"remaining_budget > 50%\"\n      actions:\n        - \"Normal development velocity\"\n        - \"Feature work prioritized\"\n        - \"Chaos experiments allowed\"\n    \n    warning:\n      condition: \"remaining_budget 25-50%\"\n      actions:\n        - \"Increase monitoring scrutiny\"\n        - \"Review recent changes for risk\"\n        - \"Limit risky deployments to business hours\"\n        - \"No chaos experiments\"\n    \n    critical:\n      condition: \"remaining_budget 0-25%\"\n      actions:\n        - \"Feature freeze — reliability work only\"\n        - \"All deployments require SRE approval\"\n        - \"Mandatory rollback plan for every change\"\n        - \"Daily error budget review\"\n    \n    exhausted:\n      condition: \"remaining_budget <= 0\"\n      actions:\n        - \"Complete deployment freeze\"\n        - \"All engineering redirected to reliability\"\n        - \"VP Engineering notified\"\n        - \"Postmortem required for budget exhaustion\"\n        - \"Freeze maintained until budget recovers to 10%\"\n  \n  exceptions:\n    - \"Security patches always allowed\"\n    - \"Regulatory compliance changes always allowed\"\n    - \"Data loss prevention always allowed\"\n  \n  reset: \"Rolling 28-day window (no manual resets)\"\n\nBurn Rate Calculation\nBurn rate = (error rate observed) / (error rate allowed by SLO)\n\nExample:\n- SLO: 99.9% (error budget = 0.1%)\n- Current error rate: 0.5%\n- Burn rate = 0.5% / 0.1% = 5x\n\nAt 5x burn rate → budget exhausted in 28d / 5 = 5.6 days\n\nError Budget Dashboard\n\nTrack weekly:\n\nMetric\tCurrent\tTrend\tStatus\nBudget remaining (%)\t\t↑↓→\t🟢🟡🔴\nBudget consumed this week\t\t\t\nBurn rate (1h / 6h / 24h)\t\t\t\nIncidents consuming budget\t\t\t\nTop error contributor\t\t\t\nProjected exhaustion date\t\t\t\nPhase 4: Monitoring & Alerting Architecture\nFour Golden Signals\nSignal\tWhat to Measure\tAlert When\nLatency\tp50, p95, p99 response time\tp99 > 2x baseline for 5 min\nTraffic\tRequests/sec, concurrent users\t>30% drop (indicates upstream issue) OR >50% spike\nErrors\t5xx rate, timeout rate, exception rate\tError rate > SLO burn rate threshold\nSaturation\tCPU, memory, disk, connections, queue depth\t>80% sustained for 10 min\nUSE Method (Infrastructure)\n\nFor every resource, track:\n\nUtilization: % of capacity used (0-100%)\nSaturation: queue depth / wait time (0 = no waiting)\nErrors: error count / error rate\nRED Method (Services)\n\nFor every service, track:\n\nRate: requests per second\nErrors: failed requests per second\nDuration: latency distribution\nAlert Design Rules\nEvery alert must have a runbook link — no exceptions\nEvery alert must be actionable — if you can't act on it, delete it\nSymptoms over causes — alert on \"users can't check out\" not \"database CPU high\"\nMulti-window, multi-burn-rate — avoid single-threshold alerts\nPage only for customer impact — everything else is a ticket\nAlert fatigue = death — review alert volume monthly; target <5 pages/week per service\nAlert Severity Guide\nSeverity\tResponse Time\tNotification\tExamples\nP0/Page\t<5 min\tPagerDuty + phone\tSLO burn rate critical, data loss, security breach\nP1/Urgent\t<30 min\tSlack + PagerDuty\tDegraded service, elevated errors, capacity warning\nP2/Ticket\tNext business day\tTicket auto-created\tSlow burn, non-critical component down\nP3/Log\tWeekly review\tDashboard only\tInformational, trend detection\nStructured Log Standard\n{\n  \"timestamp\": \"2026-02-17T11:24:00.000Z\",\n  \"level\": \"error\",\n  \"service\": \"payment-api\",\n  \"trace_id\": \"abc123\",\n  \"span_id\": \"def456\",\n  \"message\": \"Payment processing failed\",\n  \"error_type\": \"TimeoutException\",\n  \"error_message\": \"Gateway timeout after 30s\",\n  \"http_method\": \"POST\",\n  \"http_path\": \"/api/v1/payments\",\n  \"http_status\": 504,\n  \"duration_ms\": 30012,\n  \"customer_id\": \"cust_xxx\",\n  \"payment_id\": \"pay_yyy\",\n  \"amount_cents\": 4999,\n  \"retry_count\": 2,\n  \"environment\": \"production\",\n  \"host\": \"payment-api-7b4d9-xk2p1\",\n  \"region\": \"us-east-1\"\n}\n\nPhase 5: Incident Response Framework\nSeverity Classification Matrix\n\tImpact: 1 User\tImpact: <25% Users\tImpact: >25% Users\tImpact: All Users\nCore function down\tSEV3\tSEV2\tSEV1\tSEV1\nDegraded performance\tSEV4\tSEV3\tSEV2\tSEV1\nNon-core feature down\tSEV4\tSEV3\tSEV3\tSEV2\nCosmetic/minor\tSEV4\tSEV4\tSEV3\tSEV3\n\nAuto-escalation triggers:\n\nAny data loss → SEV1 minimum\nSecurity breach with PII → SEV1\nRevenue-impacting → SEV1 or SEV2\nSLA breach imminent → auto-escalate one level\nIncident Command System (ICS)\nRole\tResponsibility\tAssigned\nIncident Commander (IC)\tOwns resolution, makes decisions, manages timeline\t\nCommunications Lead\tStatus updates, stakeholder comms, customer-facing\t\nOperations Lead\tHands-on-keyboard, executing fixes\t\nSubject Matter Expert\tDeep knowledge of affected system\t\nScribe\tDocumenting timeline, actions, decisions\t\n\nIC Rules:\n\nIC does NOT debug — IC coordinates\nIC makes final decisions when team disagrees\nIC can escalate severity at any time\nIC owns handoff if rotation changes\nIC calls end-of-incident\nIncident Response Workflow\nDETECT → TRIAGE → RESPOND → MITIGATE → RESOLVE → REVIEW\n\nStep 1: DETECT (0-5 min)\n├── Alert fires OR user report received\n├── On-call acknowledges within SLA\n└── Quick assessment: is this real? What severity?\n\nStep 2: TRIAGE (5-15 min)\n├── Classify severity using matrix above\n├── Assign IC and roles\n├── Open incident channel (#inc-YYYY-MM-DD-title)\n├── Post initial status update\n└── Start timeline document\n\nStep 3: RESPOND (15 min - ongoing)\n├── IC briefs team: \"Here's what we know, here's what we don't\"\n├── Operations Lead begins investigation\n├── Check: recent deployments? Config changes? Dependency issues?\n├── Parallel investigation tracks if needed\n└── 15-minute check-ins for SEV1, 30-min for SEV2\n\nStep 4: MITIGATE (ASAP)\n├── Priority: STOP THE BLEEDING\n├── Options (fastest first):\n│   ├── Rollback last deployment\n│   ├── Feature flag disable\n│   ├── Traffic shift / failover\n│   ├── Scale up / circuit breaker\n│   └── Manual data fix\n├── Mitigated ≠ Resolved — temporary fix is OK\n└── Update status: \"Impact mitigated, root cause investigation ongoing\"\n\nStep 5: RESOLVE\n├── Root cause identified and fixed\n├── Verification: SLIs back to normal for 30+ minutes\n├── All-clear communicated\n└── IC declares incident resolved\n\nStep 6: REVIEW (within 5 business days)\n├── Blameless postmortem written\n├── Action items assigned with owners and deadlines\n├── Postmortem review meeting\n└── Action items tracked to completion\n\nCommunication Templates\n\nInitial notification (internal):\n\n🔴 INCIDENT: [Title]\nSeverity: SEV[X]\nImpact: [Who/what is affected]\nStatus: Investigating\nIC: [Name]\nChannel: #inc-[date]-[slug]\nNext update: [time]\n\n\nCustomer-facing status:\n\n[Service] - Investigating increased error rates\n\nWe are currently investigating reports of [symptom]. \nSome users may experience [user-visible impact].\nOur team is actively working on a resolution.\nWe will provide an update within [time].\n\n\nResolution notification:\n\n✅ RESOLVED: [Title]\nDuration: [X hours Y minutes]\nImpact: [Summary]\nRoot cause: [One sentence]\nPostmortem: [Link] (within 5 business days)\n\nPhase 6: Postmortem Framework\nBlameless Postmortem Template\npostmortem:\n  title: \"\"\n  date: \"\"\n  severity: \"\"  # SEV1-4\n  duration: \"\"  # total incident duration\n  authors: []\n  reviewers: []\n  status: \"draft\"  # draft | in-review | final\n  \n  summary: |\n    One paragraph: what happened, what was the impact, how was it resolved.\n  \n  impact:\n    users_affected: 0\n    duration_minutes: 0\n    revenue_impact_usd: 0\n    slo_budget_consumed_pct: 0\n    data_loss: false\n    customer_tickets: 0\n  \n  timeline:\n    - time: \"\"\n      event: \"\"\n      # Chronological, every significant event\n      # Include detection time, escalation, mitigation attempts\n  \n  root_cause: |\n    Technical explanation of WHY it happened.\n    Go deep — surface causes are not root causes.\n  \n  contributing_factors:\n    - \"\"  # What made it worse or delayed resolution?\n  \n  detection:\n    how_detected: \"\"  # alert | user report | manual check\n    time_to_detect_minutes: 0\n    could_have_detected_sooner: \"\"\n  \n  resolution:\n    how_resolved: \"\"\n    time_to_mitigate_minutes: 0\n    time_to_resolve_minutes: 0\n  \n  what_went_well:\n    - \"\"  # Explicitly call out what worked\n  \n  what_went_wrong:\n    - \"\"\n  \n  where_we_got_lucky:\n    - \"\"  # Things that could have made it worse\n  \n  action_items:\n    - id: \"AI-001\"\n      type: \"\"  # prevent | detect | mitigate | process\n      description: \"\"\n      owner: \"\"\n      priority: \"\"  # P0 | P1 | P2\n      deadline: \"\"\n      status: \"open\"  # open | in-progress | done\n      ticket: \"\"\n\nRoot Cause Analysis Methods\n\nFive Whys (simple incidents):\n\nWhy did users see errors? → API returned 500s\nWhy did API return 500s? → Database connection pool exhausted\nWhy was pool exhausted? → Long-running query held connections\nWhy was query long-running? → Missing index on new column\nWhy was index missing? → Migration didn't include index; no query performance review in CI\n\n→ Root cause: No automated query performance check in deployment pipeline → Action: Add query plan analysis to CI for migration PRs\n\nFishbone / Ishikawa (complex incidents):\n\nCategories to investigate:\n├── People: Training? Fatigue? Communication?\n├── Process: Runbook? Escalation? Change management?\n├── Technology: Bug? Config? Capacity? Dependency?\n├── Environment: Network? Cloud provider? Third party?\n├── Monitoring: Detection gap? Alert fatigue? Dashboard gap?\n└── Testing: Test coverage? Load testing? Chaos testing?\n\n\nContributing Factor Categories:\n\nCategory\tQuestions\nTrigger\tWhat change or event started it?\nPropagation\tWhy did it spread? Why wasn't it contained?\nDetection\tWhy wasn't it caught earlier?\nResolution\tWhat slowed the fix?\nProcess\tWhat process gaps contributed?\nPostmortem Review Meeting (60 min)\n1. Timeline walk-through (15 min)\n   - Author presents chronology\n   - Attendees add context (\"I remember seeing X at this point\")\n\n2. Root cause deep-dive (15 min)  \n   - Do we agree on root cause?\n   - Are there additional contributing factors?\n\n3. Action item review (20 min)\n   - Are these the RIGHT actions?\n   - Are they prioritized correctly?\n   - Do owners agree on deadlines?\n\n4. Process improvements (10 min)\n   - Could we have detected this sooner?\n   - Could we have resolved this faster?\n   - What would have prevented this entirely?\n\nPhase 7: Chaos Engineering\nChaos Maturity Model\nLevel\tName\tActivities\n0\tNone\tNo chaos testing\n1\tExploratory\tManual fault injection in staging\n2\tSystematic\tScheduled chaos experiments in staging\n3\tProduction\tControlled chaos in production (Game Days)\n4\tContinuous\tAutomated chaos in production with safety controls\nChaos Experiment Template\nexperiment:\n  name: \"\"\n  hypothesis: \"When [fault], the system will [expected behavior]\"\n  \n  steady_state:\n    metrics:\n      - name: \"\"\n        baseline: \"\"\n        acceptable_range: \"\"\n  \n  method:\n    fault_type: \"\"  # network | compute | storage | dependency | data\n    target: \"\"      # which service/component\n    blast_radius: \"\"  # single pod | single AZ | percentage of traffic\n    duration: \"\"\n    \n  safety:\n    abort_conditions:\n      - \"SLO burn rate exceeds 10x\"\n      - \"Customer-visible errors detected\"\n      - \"Alert fires that we didn't expect\"\n    rollback_plan: \"\"\n    required_approvals: []\n    \n  results:\n    outcome: \"\"  # confirmed | disproved | inconclusive\n    observations: []\n    action_items: []\n\nChaos Experiment Library\nCategory\tExperiment\tValidates\nNetwork\tAdd 200ms latency to DB calls\tTimeout handling, circuit breakers\nNetwork\tDrop 5% of packets to downstream\tRetry logic, error handling\nNetwork\tDNS resolution failure\tCaching, fallback, error messages\nCompute\tKill random pod every 10 min\tAuto-restart, load balancing\nCompute\tCPU stress to 95% on 1 node\tAuto-scaling, graceful degradation\nCompute\tFill disk to 95%\tDisk monitoring, log rotation, alerts\nStorage\tIncrease DB latency 5x\tConnection pool handling, timeouts\nStorage\tSimulate cache failure (Redis down)\tCache-aside pattern, DB fallback\nDependency\tBlock external API (payment provider)\tCircuit breaker, queuing, retry\nDependency\tReturn 429s from auth service\tRate limit handling, backoff\nData\tClock skew on subset of nodes\tTimestamp handling, ordering\nScale\t10x traffic spike over 5 minutes\tAuto-scaling speed, queue depth\nGame Day Runbook\nPRE-GAME (1 week before):\n□ Experiment designed and reviewed\n□ Steady-state metrics identified\n□ Abort conditions defined\n□ All participants briefed\n□ Runbacks tested in staging\n□ Stakeholders notified\n\nGAME DAY:\n□ Verify steady state (15 min baseline)\n□ Announce in #engineering: \"Chaos Game Day starting\"\n□ Inject fault\n□ Observe and document\n□ If abort condition hit → rollback immediately\n□ Run for planned duration\n□ Remove fault\n□ Verify recovery to steady state\n\nPOST-GAME (same day):\n□ Results documented\n□ Surprises noted\n□ Action items created\n□ Share findings in team meeting\n\nPhase 8: Toil Management\nToil Identification\n\nDefinition: Work that is manual, repetitive, automatable, tactical, without enduring value, and scales linearly with service growth.\n\nToil Inventory Template\ntoil_item:\n  name: \"\"\n  category: \"\"  # deployment | scaling | config | data | access | monitoring | recovery\n  frequency: \"\"  # daily | weekly | monthly | per-incident\n  time_per_occurrence_min: 0\n  occurrences_per_month: 0\n  total_hours_per_month: 0\n  teams_affected: []\n  automation_difficulty: \"\"  # low | medium | high\n  automation_value: 0  # hours saved per month\n  priority_score: 0  # value / difficulty\n\nToil Reduction Priority Matrix\n\tLow Effort\tMedium Effort\tHigh Effort\nHigh Value (>10 hrs/mo)\tDO FIRST\tDO SECOND\tPLAN\nMed Value (2-10 hrs/mo)\tDO SECOND\tPLAN\tEVALUATE\nLow Value (<2 hrs/mo)\tQUICK WIN\tSKIP\tSKIP\nCommon Toil Targets (Ranked by Impact)\nManual deployments → CI/CD pipeline + GitOps\nAccess provisioning → Self-service + auto-approval for low-risk\nCertificate renewals → Auto-renewal (cert-manager, Let's Encrypt)\nScaling decisions → HPA + predictive auto-scaling\nLog investigation → Structured logging + correlation + dashboards\nData fixes → Self-service admin tools + validation at ingestion\nConfig changes → Config-as-code + automated rollout\nIncident response → Automated runbooks for known issues\nCapacity reporting → Automated dashboards + forecasting\nOn-call triage → Noise reduction + auto-remediation for known patterns\nToil Budget Rule\n\nTarget: <25% of SRE time spent on toil. Track monthly. If above 25%, prioritize automation over all feature work.\n\nPhase 9: Capacity Planning\nCapacity Model Template\ncapacity_model:\n  service: \"\"\n  bottleneck_resource: \"\"  # CPU | memory | storage | connections | bandwidth\n  \n  current_state:\n    peak_utilization_pct: 0\n    headroom_pct: 0\n    cost_per_month_usd: 0\n    \n  growth_forecast:\n    metric: \"\"  # MAU | requests/sec | storage_gb\n    current: 0\n    monthly_growth_pct: 0\n    projected_6mo: 0\n    projected_12mo: 0\n    \n  scaling_strategy:\n    type: \"\"  # horizontal | vertical | hybrid\n    auto_scaling: true\n    min_instances: 0\n    max_instances: 0\n    scale_up_threshold: 80  # % utilization\n    scale_down_threshold: 30\n    cooldown_seconds: 300\n    \n  cost_projection:\n    current_monthly: 0\n    projected_6mo_monthly: 0\n    projected_12mo_monthly: 0\n\nCapacity Planning Cadence\nFrequency\tAction\nDaily\tReview auto-scaling events, check for anomalies\nWeekly\tReview utilization trends, spot-check headroom\nMonthly\tUpdate growth model, review cost projections\nQuarterly\tFull capacity review, budget planning, architecture check\nPre-launch\tLoad test to 2x expected peak, verify scaling\nLoad Testing Benchmarks\nScenario\tMethod\tDuration\tTarget\nBaseline\tSteady load at current peak\t30 min\tEstablish metrics\nGrowth\t2x current peak\t15 min\tVerify scaling works\nSpike\t10x normal in 60 seconds\t5 min\tCircuit breakers hold\nSoak\t1.5x normal load\t4 hours\tNo memory leaks, degradation\nStress\tRamp until failure\tUntil break\tFind actual limits\nPhase 10: On-Call Excellence\nOn-Call Health Metrics\nMetric\tHealthy\tWarning\tCritical\nPages per shift\t<2\t2-5\t>5\nOff-hours pages\t<1/week\t1-3/week\t>3/week\nTime to acknowledge\t<5 min\t5-15 min\t>15 min\nTime to mitigate\t<30 min\t30-60 min\t>60 min\nFalse positive rate\t<10%\t10-30%\t>30%\nEscalation rate\t<20%\t20-40%\t>40%\nOn-call satisfaction\t>4/5\t3-4/5\t<3/5\nOn-Call Rotation Best Practices\nMinimum rotation size: 5 people (one week on, four weeks off)\nNo back-to-back weeks unless team is too small (fix the team size)\nFollow-the-sun for global teams (no one pages at 3 AM if avoidable)\nPrimary + secondary on-call always\nHandoff document at rotation change — open issues, recent deploys, known risks\nCompensation — on-call pay, time off in lieu, or equivalent\nOn-Call Handoff Template\n## On-Call Handoff: [Date]\n\n### Open Issues\n- [Issue]: [Status, next steps]\n\n### Recent Changes (last 7 days)\n- [Deployment/config change]: [Risk level, rollback plan]\n\n### Known Risks\n- [Event/condition]: [What to watch for]\n\n### Scheduled Maintenance\n- [When]: [What, duration, rollback plan]\n\n### Runbook Updates\n- [Any new/updated runbooks since last rotation]\n\nRunbook Template\nrunbook:\n  title: \"\"\n  alert_name: \"\"  # exact alert that triggers this\n  last_updated: \"\"\n  owner: \"\"\n  \n  overview: |\n    What this alert means in plain English.\n    \n  impact: |\n    What users/systems are affected and how.\n    \n  diagnosis:\n    - step: \"Check service health\"\n      command: \"\"\n      expected: \"\"\n      if_unexpected: \"\"\n    - step: \"Check recent deployments\"\n      command: \"\"\n      expected: \"\"\n      if_unexpected: \"Rollback: [command]\"\n    - step: \"Check dependencies\"\n      command: \"\"\n      expected: \"\"\n      if_unexpected: \"\"\n      \n  mitigation:\n    - option: \"Rollback\"\n      when: \"Recent deployment suspected\"\n      steps: []\n    - option: \"Scale up\"\n      when: \"Traffic spike\"\n      steps: []\n    - option: \"Failover\"\n      when: \"Single component failure\"\n      steps: []\n      \n  escalation:\n    after_minutes: 30\n    contact: \"\"\n    context_to_provide: \"\"\n\nPhase 11: Reliability Review & Governance\nWeekly SRE Review (30 min)\n1. SLO Status (5 min)\n   - Budget remaining per service\n   - Any burn rate alerts this week?\n\n2. Incident Review (10 min)\n   - Incidents this week: count, severity, duration\n   - Open postmortem action items: status check\n\n3. On-Call Health (5 min)\n   - Pages this week (total, off-hours, false positives)\n   - Any on-call feedback?\n\n4. Reliability Work (10 min)\n   - Automation shipped this week\n   - Toil reduced (hours saved)\n   - Chaos experiments run\n   - Capacity concerns\n\nMonthly Reliability Report\nmonthly_report:\n  period: \"\"\n  \n  slo_summary:\n    services_meeting_slo: 0\n    services_breaching_slo: 0\n    worst_performing: \"\"\n    \n  incidents:\n    total: 0\n    by_severity: { SEV1: 0, SEV2: 0, SEV3: 0, SEV4: 0 }\n    mttr_minutes: 0\n    mttd_minutes: 0\n    repeat_incidents: 0\n    \n  error_budget:\n    services_in_healthy: 0\n    services_in_warning: 0\n    services_in_critical: 0\n    services_exhausted: 0\n    \n  toil:\n    hours_spent: 0\n    hours_automated_away: 0\n    pct_of_sre_time: 0\n    \n  on_call:\n    total_pages: 0\n    off_hours_pages: 0\n    false_positive_pct: 0\n    avg_ack_time_min: 0\n    \n  action_items:\n    open: 0\n    completed_this_month: 0\n    overdue: 0\n    \n  highlights: []\n  concerns: []\n  next_month_priorities: []\n\nProduction Readiness Review Checklist\n\nBefore any new service goes to production:\n\nCategory\tCheck\tStatus\nSLOs\tSLIs defined and measured\t\nSLOs\tSLO targets set with stakeholder agreement\t\nSLOs\tError budget policy documented\t\nMonitoring\tGolden signals dashboarded\t\nMonitoring\tAlerting configured with runbooks\t\nMonitoring\tStructured logging implemented\t\nMonitoring\tDistributed tracing enabled\t\nIncidents\tOn-call rotation established\t\nIncidents\tEscalation paths documented\t\nIncidents\tRunbooks for top 5 failure modes\t\nCapacity\tLoad tested to 2x expected peak\t\nCapacity\tAuto-scaling configured and tested\t\nCapacity\tResource limits set (CPU, memory)\t\nResilience\tGraceful degradation implemented\t\nResilience\tCircuit breakers for dependencies\t\nResilience\tRetry with exponential backoff\t\nResilience\tTimeout configured for all external calls\t\nDeploy\tRollback tested and documented\t\nDeploy\tCanary/blue-green deployment ready\t\nDeploy\tFeature flags for risky features\t\nSecurity\tAuthentication and authorization\t\nSecurity\tSecrets in vault (not env vars)\t\nSecurity\tDependencies scanned\t\nData\tBackup and restore tested\t\nData\tData retention policy defined\t\nDocs\tArchitecture diagram current\t\nDocs\tAPI documentation published\t\nDocs\tOperational runbook complete\t\nPhase 12: Advanced Patterns\nSelf-Healing Automation\nauto_remediation:\n  - trigger: \"pod_crash_loop\"\n    condition: \"restart_count > 3 in 10 min\"\n    action: \"Delete pod, let scheduler reschedule\"\n    escalate_if: \"Still crashing after 3 auto-remediations\"\n    \n  - trigger: \"disk_usage_high\"\n    condition: \"disk_usage > 85%\"\n    action: \"Run log cleanup script, archive old data\"\n    escalate_if: \"Still above 85% after cleanup\"\n    \n  - trigger: \"connection_pool_exhausted\"\n    condition: \"available_connections = 0\"\n    action: \"Kill idle connections, increase pool temporarily\"\n    escalate_if: \"Pool exhausted again within 1 hour\"\n    \n  - trigger: \"certificate_expiring\"\n    condition: \"days_until_expiry < 14\"\n    action: \"Trigger cert renewal\"\n    escalate_if: \"Renewal fails\"\n\nMulti-Region Reliability\nStrategy\tComplexity\tRTO\tCost\nActive-passive\tLow\tMinutes\t1.5x\nActive-active read\tMedium\tSeconds\t1.8x\nActive-active full\tHigh\tNear-zero\t2-3x\nCell-based\tVery high\tPer-cell\t2-4x\n\nDecision guide:\n\nSLO < 99.9% → Single region with good backups\nSLO 99.9-99.95% → Active-passive with automated failover\nSLO > 99.95% → Active-active (read or full)\nSLO > 99.99% → Cell-based architecture\nReliability Culture Indicators\n\nHealthy signals:\n\nPostmortems are blameless and well-attended\nError budgets are respected (feature freeze actually happens)\nOn-call is shared fairly and compensated\nToil is tracked and reducing quarter-over-quarter\nChaos experiments happen regularly\nTeams own their reliability (not just SRE)\n\nWarning signs:\n\n\"Hero culture\" — same person always saves the day\nPostmortems are blame-focused or skipped\nError budget exhaustion doesn't change behavior\nOn-call is dreaded, same 2 people always paged\n\"We'll fix reliability after this feature ships\" (always)\nSRE team is just an ops team with a new name\nQuality Scoring Rubric (0-100)\nDimension\tWeight\t0-2\t3-4\t5\nSLO Coverage\t20%\tNo SLOs\tSLOs for critical services\tAll services with SLOs, error budgets, reviews\nMonitoring\t15%\tBasic health checks\tGolden signals + dashboards\tFull observability stack + anomaly detection\nIncident Response\t15%\tAd-hoc, no process\tICS roles, runbooks, postmortems\tStructured ICS, blameless culture, action tracking\nAutomation\t15%\tManual everything\tCI/CD + some automation\tSelf-healing, GitOps, <25% toil\nChaos Engineering\t10%\tNone\tStaging experiments\tContinuous production chaos with safety\nCapacity Planning\t10%\tReactive\tQuarterly forecasting\tPredictive, auto-scaling, cost-optimized\nOn-Call Health\t10%\tBurnout, hero culture\tFair rotation, <5 pages/shift\tBalanced, compensated, <2 pages/shift\nDocumentation\t5%\tNothing written\tRunbooks exist\tComplete, current, tested runbooks\nNatural Language Commands\n\"Assess reliability for [service]\" → Run maturity assessment\n\"Define SLOs for [service]\" → Walk through SLI selection + SLO setting\n\"Check error budget for [service]\" → Calculate current budget status\n\"Start incident for [description]\" → Create incident channel, assign IC, begin workflow\n\"Write postmortem for [incident]\" → Generate structured postmortem\n\"Plan chaos experiment for [service]\" → Design experiment with hypothesis\n\"Audit toil for [team]\" → Inventory and prioritize toil\n\"Review on-call health\" → Analyze page volume, satisfaction, fairness\n\"Production readiness review for [service]\" → Run full checklist\n\"Monthly reliability report\" → Generate comprehensive report\n\"Design runbook for [alert]\" → Create structured runbook\n\"Plan capacity for [service] growing at [X%]\" → Build capacity model"
  },
  "trust": {
    "sourceLabel": "tencent",
    "provenanceUrl": "https://clawhub.ai/1kalin/afrexai-sre-platform",
    "publisherUrl": "https://clawhub.ai/1kalin/afrexai-sre-platform",
    "owner": "1kalin",
    "version": "1.0.0",
    "license": null,
    "verificationStatus": "Indexed source record"
  },
  "links": {
    "detailUrl": "https://openagent3.xyz/skills/afrexai-sre-platform",
    "downloadUrl": "https://openagent3.xyz/downloads/afrexai-sre-platform",
    "agentUrl": "https://openagent3.xyz/skills/afrexai-sre-platform/agent",
    "manifestUrl": "https://openagent3.xyz/skills/afrexai-sre-platform/agent.json",
    "briefUrl": "https://openagent3.xyz/skills/afrexai-sre-platform/agent.md"
  }
}