{
  "schemaVersion": "1.0",
  "item": {
    "slug": "incident-commander",
    "name": "Incident Commander",
    "source": "tencent",
    "type": "skill",
    "category": "开发工具",
    "sourceUrl": "https://clawhub.ai/alirezarezvani/incident-commander",
    "canonicalUrl": "https://clawhub.ai/alirezarezvani/incident-commander",
    "targetPlatform": "OpenClaw"
  },
  "install": {
    "downloadMode": "redirect",
    "downloadUrl": "/downloads/incident-commander",
    "sourceDownloadUrl": "https://wry-manatee-359.convex.site/api/v1/download?slug=incident-commander",
    "sourcePlatform": "tencent",
    "targetPlatform": "OpenClaw",
    "installMethod": "Manual import",
    "extraction": "Extract archive",
    "prerequisites": [
      "OpenClaw"
    ],
    "packageFormat": "ZIP package",
    "includedAssets": [
      "README.md",
      "SKILL.md",
      "assets/incident_report_template.md",
      "assets/runbook_template.md",
      "assets/sample_incident_classification.json",
      "assets/sample_incident_data.json"
    ],
    "primaryDoc": "SKILL.md",
    "quickSetup": [
      "Download the package from Yavira.",
      "Extract the archive and review SKILL.md first.",
      "Import or place the package into your OpenClaw setup."
    ],
    "agentAssist": {
      "summary": "Hand the extracted package to your coding agent with a concrete install brief instead of figuring it out manually.",
      "steps": [
        "Download the package from Yavira.",
        "Extract it into a folder your agent can access.",
        "Paste one of the prompts below and point your agent at the extracted folder."
      ],
      "prompts": [
        {
          "label": "New install",
          "body": "I downloaded a skill package from Yavira. Read SKILL.md from the extracted folder and install it by following the included instructions. Then review README.md for any prerequisites, environment setup, or post-install checks. Tell me what you changed and call out any manual steps you could not complete."
        },
        {
          "label": "Upgrade existing",
          "body": "I downloaded an updated skill package from Yavira. Read SKILL.md from the extracted folder, compare it with my current installation, and upgrade it while preserving any custom configuration unless the package docs explicitly say otherwise. Then review README.md for any prerequisites, environment setup, or post-install checks. Summarize what changed and any follow-up checks I should run."
        }
      ]
    },
    "sourceHealth": {
      "source": "tencent",
      "status": "healthy",
      "reason": "direct_download_ok",
      "recommendedAction": "download",
      "checkedAt": "2026-04-30T16:55:25.780Z",
      "expiresAt": "2026-05-07T16:55:25.780Z",
      "httpStatus": 200,
      "finalUrl": "https://wry-manatee-359.convex.site/api/v1/download?slug=network",
      "contentType": "application/zip",
      "probeMethod": "head",
      "details": {
        "probeUrl": "https://wry-manatee-359.convex.site/api/v1/download?slug=network",
        "contentDisposition": "attachment; filename=\"network-1.0.0.zip\"",
        "redirectLocation": null,
        "bodySnippet": null
      },
      "scope": "source",
      "summary": "Source download looks usable.",
      "detail": "Yavira can redirect you to the upstream package for this source.",
      "primaryActionLabel": "Download for OpenClaw",
      "primaryActionHref": "/downloads/incident-commander"
    },
    "validation": {
      "installChecklist": [
        "Use the Yavira download entry.",
        "Review SKILL.md after the package is downloaded.",
        "Confirm the extracted package contains the expected setup assets."
      ],
      "postInstallChecks": [
        "Confirm the extracted package includes the expected docs or setup files.",
        "Validate the skill or prompts are available in your target agent workspace.",
        "Capture any manual follow-up steps the agent could not complete."
      ]
    },
    "downloadPageUrl": "https://openagent3.xyz/downloads/incident-commander",
    "agentPageUrl": "https://openagent3.xyz/skills/incident-commander/agent",
    "manifestUrl": "https://openagent3.xyz/skills/incident-commander/agent.json",
    "briefUrl": "https://openagent3.xyz/skills/incident-commander/agent.md"
  },
  "agentAssist": {
    "summary": "Hand the extracted package to your coding agent with a concrete install brief instead of figuring it out manually.",
    "steps": [
      "Download the package from Yavira.",
      "Extract it into a folder your agent can access.",
      "Paste one of the prompts below and point your agent at the extracted folder."
    ],
    "prompts": [
      {
        "label": "New install",
        "body": "I downloaded a skill package from Yavira. Read SKILL.md from the extracted folder and install it by following the included instructions. Then review README.md for any prerequisites, environment setup, or post-install checks. Tell me what you changed and call out any manual steps you could not complete."
      },
      {
        "label": "Upgrade existing",
        "body": "I downloaded an updated skill package from Yavira. Read SKILL.md from the extracted folder, compare it with my current installation, and upgrade it while preserving any custom configuration unless the package docs explicitly say otherwise. Then review README.md for any prerequisites, environment setup, or post-install checks. Summarize what changed and any follow-up checks I should run."
      }
    ]
  },
  "documentation": {
    "source": "clawhub",
    "primaryDoc": "SKILL.md",
    "sections": [
      {
        "title": "Incident Commander Skill",
        "body": "Category: Engineering Team\nTier: POWERFUL\nAuthor: Claude Skills Team\nVersion: 1.0.0\nLast Updated: February 2026"
      },
      {
        "title": "Overview",
        "body": "The Incident Commander skill provides a comprehensive incident response framework for managing technology incidents from detection through resolution and post-incident review. This skill implements battle-tested practices from SRE and DevOps teams at scale, providing structured tools for severity classification, timeline reconstruction, and thorough post-incident analysis."
      },
      {
        "title": "Key Features",
        "body": "Automated Severity Classification - Intelligent incident triage based on impact and urgency metrics\nTimeline Reconstruction - Transform scattered logs and events into coherent incident narratives\nPost-Incident Review Generation - Structured PIRs with multiple RCA frameworks\nCommunication Templates - Pre-built templates for stakeholder updates and escalations\nRunbook Integration - Generate actionable runbooks from incident patterns"
      },
      {
        "title": "Core Tools",
        "body": "Incident Classifier (incident_classifier.py)\n\nAnalyzes incident descriptions and outputs severity levels\nRecommends response teams and initial actions\nGenerates communication templates based on severity\n\n\n\nTimeline Reconstructor (timeline_reconstructor.py)\n\nProcesses timestamped events from multiple sources\nReconstructs chronological incident timeline\nIdentifies gaps and provides duration analysis\n\n\n\nPIR Generator (pir_generator.py)\n\nCreates comprehensive Post-Incident Review documents\nApplies multiple RCA frameworks (5 Whys, Fishbone, Timeline)\nGenerates actionable follow-up items"
      },
      {
        "title": "Severity Classification System",
        "body": "SEV1 - Critical Outage\n\nDefinition: Complete service failure affecting all users or critical business functions\n\nCharacteristics:\n\nCustomer-facing services completely unavailable\nData loss or corruption affecting users\nSecurity breaches with customer data exposure\nRevenue-generating systems down\nSLA violations with financial penalties\n\nResponse Requirements:\n\nImmediate escalation to on-call engineer\nIncident Commander assigned within 5 minutes\nExecutive notification within 15 minutes\nPublic status page update within 15 minutes\nWar room established\nAll hands on deck if needed\n\nCommunication Frequency: Every 15 minutes until resolution\n\nSEV2 - Major Impact\n\nDefinition: Significant degradation affecting subset of users or non-critical functions\n\nCharacteristics:\n\nPartial service degradation (>25% of users affected)\nPerformance issues causing user frustration\nNon-critical features unavailable\nInternal tools impacting productivity\nData inconsistencies not affecting user experience\n\nResponse Requirements:\n\nOn-call engineer response within 15 minutes\nIncident Commander assigned within 30 minutes\nStatus page update within 30 minutes\nStakeholder notification within 1 hour\nRegular team updates\n\nCommunication Frequency: Every 30 minutes during active response\n\nSEV3 - Minor Impact\n\nDefinition: Limited impact with workarounds available\n\nCharacteristics:\n\nSingle feature or component affected\n<25% of users impacted\nWorkarounds available\nPerformance degradation not significantly impacting UX\nNon-urgent monitoring alerts\n\nResponse Requirements:\n\nResponse within 2 hours during business hours\nNext business day response acceptable outside hours\nInternal team notification\nOptional status page update\n\nCommunication Frequency: At key milestones only\n\nSEV4 - Low Impact\n\nDefinition: Minimal impact, cosmetic issues, or planned maintenance\n\nCharacteristics:\n\nCosmetic bugs\nDocumentation issues\nLogging 
or monitoring gaps\nPerformance issues with no user impact\nDevelopment/test environment issues\n\nResponse Requirements:\n\nResponse within 1-2 business days\nStandard ticket/issue tracking\nNo special escalation required\n\nCommunication Frequency: Standard development cycle updates"
      },
      {
        "title": "Incident Commander Role",
        "body": "Primary Responsibilities\n\nCommand and Control\n\nOwn the incident response process\nMake critical decisions about resource allocation\nCoordinate between technical teams and stakeholders\nMaintain situational awareness across all response streams\n\n\n\nCommunication Hub\n\nProvide regular updates to stakeholders\nManage external communications (status pages, customer notifications)\nFacilitate effective communication between response teams\nShield responders from external distractions\n\n\n\nProcess Management\n\nEnsure proper incident tracking and documentation\nDrive toward resolution while maintaining quality\nCoordinate handoffs between team members\nPlan and execute rollback strategies if needed\n\n\n\nPost-Incident Leadership\n\nEnsure thorough post-incident reviews are conducted\nDrive implementation of preventive measures\nShare learnings with broader organization\n\nDecision-Making Framework\n\nEmergency Decisions (SEV1/2):\n\nIncident Commander has full authority\nBias toward action over analysis\nDocument decisions for later review\nConsult subject matter experts but don't get blocked\n\nResource Allocation:\n\nCan pull in any necessary team members\nAuthority to escalate to senior leadership\nCan approve emergency spend for external resources\nMake call on communication channels and timing\n\nTechnical Decisions:\n\nLean on technical leads for implementation details\nMake final calls on trade-offs between speed and risk\nApprove rollback vs. fix-forward strategies\nCoordinate testing and validation approaches"
      },
      {
        "title": "Communication Templates",
        "body": "Initial Incident Notification (SEV1/2)\n\nSubject: [SEV{severity}] {Service Name} - {Brief Description}\n\nIncident Details:\n- Start Time: {timestamp}\n- Severity: SEV{level}\n- Impact: {user impact description}\n- Current Status: {investigating/mitigating/resolved}\n\nTechnical Details:\n- Affected Services: {service list}\n- Symptoms: {what users are experiencing}\n- Initial Assessment: {suspected root cause if known}\n\nResponse Team:\n- Incident Commander: {name}\n- Technical Lead: {name}\n- SMEs Engaged: {list}\n\nNext Update: {timestamp}\nStatus Page: {link}\nWar Room: {bridge/chat link}\n\n---\n{Incident Commander Name}\n{Contact Information}\n\nExecutive Summary (SEV1)\n\nSubject: URGENT - Customer-Impacting Outage - {Service Name}\n\nExecutive Summary:\n{2-3 sentence description of customer impact and business implications}\n\nKey Metrics:\n- Time to Detection: {X minutes}\n- Time to Engagement: {X minutes} \n- Estimated Customer Impact: {number/percentage}\n- Current Status: {status}\n- ETA to Resolution: {time or \"investigating\"}\n\nLeadership Actions Required:\n- [ ] Customer communication approval\n- [ ] PR/Communications coordination  \n- [ ] Resource allocation decisions\n- [ ] External vendor engagement\n\nIncident Commander: {name} ({contact})\nNext Update: {time}\n\n---\nThis is an automated alert from our incident response system.\n\nCustomer Communication Template\n\nWe are currently experiencing {brief description of issue} affecting {scope of impact}. \n\nOur engineering team was alerted at {time} and is actively working to resolve the issue. 
We will provide updates every {frequency} until resolved.\n\nWhat we know:\n- {factual statement of impact}\n- {factual statement of scope}\n- {brief status of response}\n\nWhat we're doing:\n- {primary response action}\n- {secondary response action}\n\nWorkaround (if available):\n{workaround steps or \"No workaround currently available\"}\n\nWe apologize for the inconvenience and will share more information as it becomes available.\n\nNext update: {time}\nStatus page: {link}"
      },
      {
        "title": "Stakeholder Management",
        "body": "Stakeholder Classification\n\nInternal Stakeholders:\n\nEngineering Leadership - Technical decisions and resource allocation\nProduct Management - Customer impact assessment and feature implications\nCustomer Support - User communication and support ticket management\nSales/Account Management - Customer relationship management for enterprise clients\nExecutive Team - Business impact decisions and external communication approval\nLegal/Compliance - Regulatory reporting and liability assessment\n\nExternal Stakeholders:\n\nCustomers - Service availability and impact communication\nPartners - API availability and integration impacts\nVendors - Third-party service dependencies and support escalation\nRegulators - Compliance reporting for regulated industries\nPublic/Media - Transparency for public-facing outages\n\nCommunication Cadence by Stakeholder\n\nStakeholder\tSEV1\tSEV2\tSEV3\tSEV4\nEngineering Leadership\tReal-time\t30min\t4hrs\tDaily\nExecutive Team\t15min\t1hr\tEOD\tWeekly\nCustomer Support\tReal-time\t30min\t2hrs\tAs needed\nCustomers\t15min\t1hr\tOptional\tNone\nPartners\t30min\t2hrs\tOptional\tNone"
      },
      {
        "title": "Runbook Generation Framework",
        "body": "Dynamic Runbook Components\n\nDetection Playbooks\n\nMonitoring alert definitions\nTriage decision trees\nEscalation trigger points\nInitial response actions\n\n\n\nResponse Playbooks\n\nStep-by-step mitigation procedures\nRollback instructions\nValidation checkpoints\nCommunication checkpoints\n\n\n\nRecovery Playbooks\n\nService restoration procedures\nData consistency checks\nPerformance validation\nUser notification processes\n\nRunbook Template Structure\n\n# {Service/Component} Incident Response Runbook\n\n## Quick Reference\n- **Severity Indicators:** {list of conditions for each severity level}\n- **Key Contacts:** {on-call rotations and escalation paths}\n- **Critical Commands:** {list of emergency commands with descriptions}\n\n## Detection\n### Monitoring Alerts\n- {Alert name}: {description and thresholds}\n- {Alert name}: {description and thresholds}\n\n### Manual Detection Signs\n- {Symptom}: {what to look for and where}\n- {Symptom}: {what to look for and where}\n\n## Initial Response (0-15 minutes)\n1. **Assess Severity**\n   - [ ] Check {primary metric}\n   - [ ] Verify {secondary indicator}\n   - [ ] Classify as SEV{level} based on {criteria}\n\n2. **Establish Command**\n   - [ ] Page Incident Commander if SEV1/2\n   - [ ] Create incident tracking ticket\n   - [ ] Join war room: {link/bridge info}\n\n3. **Initial Investigation**\n   - [ ] Check recent deployments: {deployment log location}\n   - [ ] Review error logs: {log location and queries}\n   - [ ] Verify dependencies: {dependency check commands}\n\n## Mitigation Strategies\n### Strategy 1: {Name}\n**Use when:** {conditions}\n**Steps:**\n1. {detailed step with commands}\n2. {detailed step with expected outcomes}\n3. {validation step}\n\n**Rollback Plan:**\n1. {rollback step}\n2. {verification step}\n\n### Strategy 2: {Name}\n{similar structure}\n\n## Recovery and Validation\n1. 
**Service Restoration**\n   - [ ] {restoration step}\n   - [ ] Wait for {metric} to return to normal\n   - [ ] Validate end-to-end functionality\n\n2. **Communication**\n   - [ ] Update status page\n   - [ ] Notify stakeholders\n   - [ ] Schedule PIR\n\n## Common Pitfalls\n- **{Pitfall}:** {description and how to avoid}\n- **{Pitfall}:** {description and how to avoid}\n\n## Reference Information\n→ See references/reference-information.md for details\n\n## Usage Examples\n\n### Example 1: Database Connection Pool Exhaustion\n\n```bash\n# Classify the incident\necho '{\"description\": \"Users reporting 500 errors, database connections timing out\", \"affected_users\": \"80%\", \"business_impact\": \"high\"}' | python scripts/incident_classifier.py\n\n# Reconstruct timeline from logs\npython scripts/timeline_reconstructor.py --input assets/db_incident_events.json --output timeline.md\n\n# Generate PIR after resolution\npython scripts/pir_generator.py --incident assets/db_incident_data.json --timeline timeline.md --output pir.md"
      },
      {
        "title": "Example 2: API Rate Limiting Incident",
        "body": "# Quick classification from stdin\necho \"API rate limits causing customer API calls to fail\" | python scripts/incident_classifier.py --format text\n\n# Build timeline from multiple sources\npython scripts/timeline_reconstructor.py --input assets/api_incident_logs.json --detect-phases --gap-analysis\n\n# Generate comprehensive PIR\npython scripts/pir_generator.py --incident assets/api_incident_summary.json --rca-method fishbone --action-items"
      },
      {
        "title": "During Incident Response",
        "body": "Maintain Calm Leadership\n\nStay composed under pressure\nMake decisive calls with incomplete information\nCommunicate confidence while acknowledging uncertainty\n\n\n\nDocument Everything\n\nAll actions taken and their outcomes\nDecision rationale, especially for controversial calls\nTimeline of events as they happen\n\n\n\nEffective Communication\n\nUse clear, jargon-free language\nProvide regular updates even when there's no new information\nManage stakeholder expectations proactively\n\n\n\nTechnical Excellence\n\nPrefer rollbacks to risky fixes under pressure\nValidate fixes before declaring resolution\nPlan for secondary failures and cascading effects"
      },
      {
        "title": "Post-Incident",
        "body": "Blameless Culture\n\nFocus on system failures, not individual mistakes\nEncourage honest reporting of what went wrong\nCelebrate learning and improvement opportunities\n\n\n\nAction Item Discipline\n\nAssign specific owners and due dates\nTrack progress publicly\nPrioritize based on risk and effort\n\n\n\nKnowledge Sharing\n\nShare PIRs broadly within the organization\nUpdate runbooks based on lessons learned\nConduct training sessions for common failure modes\n\n\n\nContinuous Improvement\n\nLook for patterns across multiple incidents\nInvest in tooling and automation\nRegularly review and update processes"
      },
      {
        "title": "Monitoring and Alerting",
        "body": "PagerDuty/Opsgenie integration for escalation\nDatadog/Grafana for metrics and dashboards\nELK/Splunk for log analysis and correlation"
      },
      {
        "title": "Communication Platforms",
        "body": "Slack/Teams for war room coordination\nZoom/Meet for video bridges\nStatus page providers (Statuspage.io, etc.)"
      },
      {
        "title": "Documentation Systems",
        "body": "Confluence/Notion for PIR storage\nGitHub/GitLab for runbook version control\nJIRA/Linear for action item tracking"
      },
      {
        "title": "Change Management",
        "body": "CI/CD pipeline integration\nDeployment tracking systems\nFeature flag platforms for quick rollbacks"
      },
      {
        "title": "Conclusion",
        "body": "The Incident Commander skill provides a comprehensive framework for managing incidents from detection through post-incident review. By implementing structured processes, clear communication templates, and thorough analysis tools, teams can improve their incident response capabilities and build more resilient systems.\n\nThe key to successful incident management is preparation, practice, and continuous learning. Use this framework as a starting point, but adapt it to your organization's specific needs, culture, and technical environment.\n\nRemember: The goal isn't to prevent all incidents (which is impossible), but to detect them quickly, respond effectively, communicate clearly, and learn continuously."
      }
    ],
    "body": "Incident Commander Skill\n\nCategory: Engineering Team\nTier: POWERFUL\nAuthor: Claude Skills Team\nVersion: 1.0.0\nLast Updated: February 2026\n\nOverview\n\nThe Incident Commander skill provides a comprehensive incident response framework for managing technology incidents from detection through resolution and post-incident review. This skill implements battle-tested practices from SRE and DevOps teams at scale, providing structured tools for severity classification, timeline reconstruction, and thorough post-incident analysis.\n\nKey Features\nAutomated Severity Classification - Intelligent incident triage based on impact and urgency metrics\nTimeline Reconstruction - Transform scattered logs and events into coherent incident narratives\nPost-Incident Review Generation - Structured PIRs with multiple RCA frameworks\nCommunication Templates - Pre-built templates for stakeholder updates and escalations\nRunbook Integration - Generate actionable runbooks from incident patterns\nSkills Included\nCore Tools\n\nIncident Classifier (incident_classifier.py)\n\nAnalyzes incident descriptions and outputs severity levels\nRecommends response teams and initial actions\nGenerates communication templates based on severity\n\nTimeline Reconstructor (timeline_reconstructor.py)\n\nProcesses timestamped events from multiple sources\nReconstructs chronological incident timeline\nIdentifies gaps and provides duration analysis\n\nPIR Generator (pir_generator.py)\n\nCreates comprehensive Post-Incident Review documents\nApplies multiple RCA frameworks (5 Whys, Fishbone, Timeline)\nGenerates actionable follow-up items\nIncident Response Framework\nSeverity Classification System\nSEV1 - Critical Outage\n\nDefinition: Complete service failure affecting all users or critical business functions\n\nCharacteristics:\n\nCustomer-facing services completely unavailable\nData loss or corruption affecting users\nSecurity breaches with customer data exposure\nRevenue-generating systems 
down\nSLA violations with financial penalties\n\nResponse Requirements:\n\nImmediate escalation to on-call engineer\nIncident Commander assigned within 5 minutes\nExecutive notification within 15 minutes\nPublic status page update within 15 minutes\nWar room established\nAll hands on deck if needed\n\nCommunication Frequency: Every 15 minutes until resolution\n\nSEV2 - Major Impact\n\nDefinition: Significant degradation affecting subset of users or non-critical functions\n\nCharacteristics:\n\nPartial service degradation (>25% of users affected)\nPerformance issues causing user frustration\nNon-critical features unavailable\nInternal tools impacting productivity\nData inconsistencies not affecting user experience\n\nResponse Requirements:\n\nOn-call engineer response within 15 minutes\nIncident Commander assigned within 30 minutes\nStatus page update within 30 minutes\nStakeholder notification within 1 hour\nRegular team updates\n\nCommunication Frequency: Every 30 minutes during active response\n\nSEV3 - Minor Impact\n\nDefinition: Limited impact with workarounds available\n\nCharacteristics:\n\nSingle feature or component affected\n<25% of users impacted\nWorkarounds available\nPerformance degradation not significantly impacting UX\nNon-urgent monitoring alerts\n\nResponse Requirements:\n\nResponse within 2 hours during business hours\nNext business day response acceptable outside hours\nInternal team notification\nOptional status page update\n\nCommunication Frequency: At key milestones only\n\nSEV4 - Low Impact\n\nDefinition: Minimal impact, cosmetic issues, or planned maintenance\n\nCharacteristics:\n\nCosmetic bugs\nDocumentation issues\nLogging or monitoring gaps\nPerformance issues with no user impact\nDevelopment/test environment issues\n\nResponse Requirements:\n\nResponse within 1-2 business days\nStandard ticket/issue tracking\nNo special escalation required\n\nCommunication Frequency: Standard development cycle updates\n\nIncident Commander 
Role\nPrimary Responsibilities\n\nCommand and Control\n\nOwn the incident response process\nMake critical decisions about resource allocation\nCoordinate between technical teams and stakeholders\nMaintain situational awareness across all response streams\n\nCommunication Hub\n\nProvide regular updates to stakeholders\nManage external communications (status pages, customer notifications)\nFacilitate effective communication between response teams\nShield responders from external distractions\n\nProcess Management\n\nEnsure proper incident tracking and documentation\nDrive toward resolution while maintaining quality\nCoordinate handoffs between team members\nPlan and execute rollback strategies if needed\n\nPost-Incident Leadership\n\nEnsure thorough post-incident reviews are conducted\nDrive implementation of preventive measures\nShare learnings with broader organization\nDecision-Making Framework\n\nEmergency Decisions (SEV1/2):\n\nIncident Commander has full authority\nBias toward action over analysis\nDocument decisions for later review\nConsult subject matter experts but don't get blocked\n\nResource Allocation:\n\nCan pull in any necessary team members\nAuthority to escalate to senior leadership\nCan approve emergency spend for external resources\nMake call on communication channels and timing\n\nTechnical Decisions:\n\nLean on technical leads for implementation details\nMake final calls on trade-offs between speed and risk\nApprove rollback vs. 
fix-forward strategies\nCoordinate testing and validation approaches\nCommunication Templates\nInitial Incident Notification (SEV1/2)\nSubject: [SEV{severity}] {Service Name} - {Brief Description}\n\nIncident Details:\n- Start Time: {timestamp}\n- Severity: SEV{level}\n- Impact: {user impact description}\n- Current Status: {investigating/mitigating/resolved}\n\nTechnical Details:\n- Affected Services: {service list}\n- Symptoms: {what users are experiencing}\n- Initial Assessment: {suspected root cause if known}\n\nResponse Team:\n- Incident Commander: {name}\n- Technical Lead: {name}\n- SMEs Engaged: {list}\n\nNext Update: {timestamp}\nStatus Page: {link}\nWar Room: {bridge/chat link}\n\n---\n{Incident Commander Name}\n{Contact Information}\n\nExecutive Summary (SEV1)\nSubject: URGENT - Customer-Impacting Outage - {Service Name}\n\nExecutive Summary:\n{2-3 sentence description of customer impact and business implications}\n\nKey Metrics:\n- Time to Detection: {X minutes}\n- Time to Engagement: {X minutes} \n- Estimated Customer Impact: {number/percentage}\n- Current Status: {status}\n- ETA to Resolution: {time or \"investigating\"}\n\nLeadership Actions Required:\n- [ ] Customer communication approval\n- [ ] PR/Communications coordination  \n- [ ] Resource allocation decisions\n- [ ] External vendor engagement\n\nIncident Commander: {name} ({contact})\nNext Update: {time}\n\n---\nThis is an automated alert from our incident response system.\n\nCustomer Communication Template\nWe are currently experiencing {brief description of issue} affecting {scope of impact}. \n\nOur engineering team was alerted at {time} and is actively working to resolve the issue. 
We will provide updates every {frequency} until resolved.\n\nWhat we know:\n- {factual statement of impact}\n- {factual statement of scope}\n- {brief status of response}\n\nWhat we're doing:\n- {primary response action}\n- {secondary response action}\n\nWorkaround (if available):\n{workaround steps or \"No workaround currently available\"}\n\nWe apologize for the inconvenience and will share more information as it becomes available.\n\nNext update: {time}\nStatus page: {link}\n\nStakeholder Management\nStakeholder Classification\n\nInternal Stakeholders:\n\nEngineering Leadership - Technical decisions and resource allocation\nProduct Management - Customer impact assessment and feature implications\nCustomer Support - User communication and support ticket management\nSales/Account Management - Customer relationship management for enterprise clients\nExecutive Team - Business impact decisions and external communication approval\nLegal/Compliance - Regulatory reporting and liability assessment\n\nExternal Stakeholders:\n\nCustomers - Service availability and impact communication\nPartners - API availability and integration impacts\nVendors - Third-party service dependencies and support escalation\nRegulators - Compliance reporting for regulated industries\nPublic/Media - Transparency for public-facing outages\nCommunication Cadence by Stakeholder\nStakeholder\tSEV1\tSEV2\tSEV3\tSEV4\nEngineering Leadership\tReal-time\t30min\t4hrs\tDaily\nExecutive Team\t15min\t1hr\tEOD\tWeekly\nCustomer Support\tReal-time\t30min\t2hrs\tAs needed\nCustomers\t15min\t1hr\tOptional\tNone\nPartners\t30min\t2hrs\tOptional\tNone\nRunbook Generation Framework\nDynamic Runbook Components\n\nDetection Playbooks\n\nMonitoring alert definitions\nTriage decision trees\nEscalation trigger points\nInitial response actions\n\nResponse Playbooks\n\nStep-by-step mitigation procedures\nRollback instructions\nValidation checkpoints\nCommunication checkpoints\n\nRecovery Playbooks\n\nService restoration 
procedures\nData consistency checks\nPerformance validation\nUser notification processes\nRunbook Template Structure\n# {Service/Component} Incident Response Runbook\n\n## Quick Reference\n- **Severity Indicators:** {list of conditions for each severity level}\n- **Key Contacts:** {on-call rotations and escalation paths}\n- **Critical Commands:** {list of emergency commands with descriptions}\n\n## Detection\n### Monitoring Alerts\n- {Alert name}: {description and thresholds}\n- {Alert name}: {description and thresholds}\n\n### Manual Detection Signs\n- {Symptom}: {what to look for and where}\n- {Symptom}: {what to look for and where}\n\n## Initial Response (0-15 minutes)\n1. **Assess Severity**\n   - [ ] Check {primary metric}\n   - [ ] Verify {secondary indicator}\n   - [ ] Classify as SEV{level} based on {criteria}\n\n2. **Establish Command**\n   - [ ] Page Incident Commander if SEV1/2\n   - [ ] Create incident tracking ticket\n   - [ ] Join war room: {link/bridge info}\n\n3. **Initial Investigation**\n   - [ ] Check recent deployments: {deployment log location}\n   - [ ] Review error logs: {log location and queries}\n   - [ ] Verify dependencies: {dependency check commands}\n\n## Mitigation Strategies\n### Strategy 1: {Name}\n**Use when:** {conditions}\n**Steps:**\n1. {detailed step with commands}\n2. {detailed step with expected outcomes}\n3. {validation step}\n\n**Rollback Plan:**\n1. {rollback step}\n2. {verification step}\n\n### Strategy 2: {Name}\n{similar structure}\n\n## Recovery and Validation\n1. **Service Restoration**\n   - [ ] {restoration step}\n   - [ ] Wait for {metric} to return to normal\n   - [ ] Validate end-to-end functionality\n\n2. 
**Communication**\n   - [ ] Update status page\n   - [ ] Notify stakeholders\n   - [ ] Schedule PIR\n\n## Common Pitfalls\n- **{Pitfall}:** {description and how to avoid}\n- **{Pitfall}:** {description and how to avoid}\n\n## Reference Information\n→ See references/reference-information.md for details\n\n## Usage Examples\n\n### Example 1: Database Connection Pool Exhaustion\n\n```bash\n# Classify the incident\necho '{\"description\": \"Users reporting 500 errors, database connections timing out\", \"affected_users\": \"80%\", \"business_impact\": \"high\"}' | python scripts/incident_classifier.py\n\n# Reconstruct timeline from logs\npython scripts/timeline_reconstructor.py --input assets/db_incident_events.json --output timeline.md\n\n# Generate PIR after resolution\npython scripts/pir_generator.py --incident assets/db_incident_data.json --timeline timeline.md --output pir.md\n```\n\n### Example 2: API Rate Limiting Incident\n\n```bash\n# Quick classification from stdin\necho \"API rate limits causing customer API calls to fail\" | python scripts/incident_classifier.py --format text\n\n# Build timeline from multiple sources\npython scripts/timeline_reconstructor.py --input assets/api_incident_logs.json --detect-phases --gap-analysis\n\n# Generate comprehensive PIR\npython scripts/pir_generator.py --incident assets/api_incident_summary.json --rca-method fishbone --action-items\n```\n\nBest Practices\nDuring Incident Response\n\nMaintain Calm Leadership\n\nStay composed under pressure\nMake decisive calls with incomplete information\nCommunicate confidence while acknowledging uncertainty\n\nDocument Everything\n\nAll actions taken and their outcomes\nDecision rationale, especially for controversial calls\nTimeline of events as they happen\n\nEffective Communication\n\nUse clear, jargon-free language\nProvide regular updates even when there's no new information\nManage stakeholder expectations proactively\n\nTechnical Excellence\n\nPrefer rollbacks to risky fixes under pressure\nValidate 
fixes before declaring resolution\nPlan for secondary failures and cascading effects\nPost-Incident\n\nBlameless Culture\n\nFocus on system failures, not individual mistakes\nEncourage honest reporting of what went wrong\nCelebrate learning and improvement opportunities\n\nAction Item Discipline\n\nAssign specific owners and due dates\nTrack progress publicly\nPrioritize based on risk and effort\n\nKnowledge Sharing\n\nShare PIRs broadly within the organization\nUpdate runbooks based on lessons learned\nConduct training sessions for common failure modes\n\nContinuous Improvement\n\nLook for patterns across multiple incidents\nInvest in tooling and automation\nRegularly review and update processes\nIntegration with Existing Tools\nMonitoring and Alerting\nPagerDuty/Opsgenie integration for escalation\nDatadog/Grafana for metrics and dashboards\nELK/Splunk for log analysis and correlation\nCommunication Platforms\nSlack/Teams for war room coordination\nZoom/Meet for video bridges\nStatus page providers (Statuspage.io, etc.)\nDocumentation Systems\nConfluence/Notion for PIR storage\nGitHub/GitLab for runbook version control\nJIRA/Linear for action item tracking\nChange Management\nCI/CD pipeline integration\nDeployment tracking systems\nFeature flag platforms for quick rollbacks\nConclusion\n\nThe Incident Commander skill provides a comprehensive framework for managing incidents from detection through post-incident review. By implementing structured processes, clear communication templates, and thorough analysis tools, teams can improve their incident response capabilities and build more resilient systems.\n\nThe key to successful incident management is preparation, practice, and continuous learning. 
Use this framework as a starting point, but adapt it to your organization's specific needs, culture, and technical environment.\n\nRemember: The goal isn't to prevent all incidents (which is impossible), but to detect them quickly, respond effectively, communicate clearly, and learn continuously."
  },
  "trust": {
    "sourceLabel": "tencent",
    "provenanceUrl": "https://clawhub.ai/alirezarezvani/incident-commander",
    "publisherUrl": "https://clawhub.ai/alirezarezvani/incident-commander",
    "owner": "alirezarezvani",
    "version": "2.1.1",
    "license": null,
    "verificationStatus": "Indexed source record"
  },
  "links": {
    "detailUrl": "https://openagent3.xyz/skills/incident-commander",
    "downloadUrl": "https://openagent3.xyz/downloads/incident-commander",
    "agentUrl": "https://openagent3.xyz/skills/incident-commander/agent",
    "manifestUrl": "https://openagent3.xyz/skills/incident-commander/agent.json",
    "briefUrl": "https://openagent3.xyz/skills/incident-commander/agent.md"
  }
}