{
  "schemaVersion": "1.0",
  "item": {
    "slug": "afrexai-observability-engine",
    "name": "Observability & Reliability Engineering",
    "source": "tencent",
    "type": "skill",
    "category": "其他",
    "sourceUrl": "https://clawhub.ai/1kalin/afrexai-observability-engine",
    "canonicalUrl": "https://clawhub.ai/1kalin/afrexai-observability-engine",
    "targetPlatform": "OpenClaw"
  },
  "install": {
    "downloadMode": "redirect",
    "downloadUrl": "/downloads/afrexai-observability-engine",
    "sourceDownloadUrl": "https://wry-manatee-359.convex.site/api/v1/download?slug=afrexai-observability-engine",
    "sourcePlatform": "tencent",
    "targetPlatform": "OpenClaw",
    "installMethod": "Manual import",
    "extraction": "Extract archive",
    "prerequisites": [
      "OpenClaw"
    ],
    "packageFormat": "ZIP package",
    "includedAssets": [
      "README.md",
      "SKILL.md"
    ],
    "primaryDoc": "SKILL.md",
    "quickSetup": [
      "Download the package from Yavira.",
      "Extract the archive and review SKILL.md first.",
      "Import or place the package into your OpenClaw setup."
    ],
    "agentAssist": {
      "summary": "Hand the extracted package to your coding agent with a concrete install brief instead of figuring it out manually.",
      "steps": [
        "Download the package from Yavira.",
        "Extract it into a folder your agent can access.",
        "Paste one of the prompts below and point your agent at the extracted folder."
      ],
      "prompts": [
        {
          "label": "New install",
          "body": "I downloaded a skill package from Yavira. Read SKILL.md from the extracted folder and install it by following the included instructions. Then review README.md for any prerequisites, environment setup, or post-install checks. Tell me what you changed and call out any manual steps you could not complete."
        },
        {
          "label": "Upgrade existing",
          "body": "I downloaded an updated skill package from Yavira. Read SKILL.md from the extracted folder, compare it with my current installation, and upgrade it while preserving any custom configuration unless the package docs explicitly say otherwise. Then review README.md for any prerequisites, environment setup, or post-install checks. Summarize what changed and any follow-up checks I should run."
        }
      ]
    },
    "sourceHealth": {
      "source": "tencent",
      "status": "healthy",
      "reason": "direct_download_ok",
      "recommendedAction": "download",
      "checkedAt": "2026-04-23T16:43:11.935Z",
      "expiresAt": "2026-04-30T16:43:11.935Z",
      "httpStatus": 200,
      "finalUrl": "https://wry-manatee-359.convex.site/api/v1/download?slug=afrexai-observability-engine",
      "contentType": "application/zip",
      "probeMethod": "head",
      "details": {
        "probeUrl": "https://wry-manatee-359.convex.site/api/v1/download?slug=afrexai-observability-engine",
        "contentDisposition": "attachment; filename=\"afrexai-observability-engine-1.0.1.zip\"",
        "redirectLocation": null,
        "bodySnippet": null
      },
      "scope": "source",
      "summary": "Source download looks usable.",
      "detail": "Yavira can redirect you to the upstream package for this source.",
      "primaryActionLabel": "Download for OpenClaw",
      "primaryActionHref": "/downloads/afrexai-observability-engine"
    },
    "validation": {
      "installChecklist": [
        "Use the Yavira download entry.",
        "Review SKILL.md after the package is downloaded.",
        "Confirm the extracted package contains the expected setup assets."
      ],
      "postInstallChecks": [
        "Confirm the extracted package includes the expected docs or setup files.",
        "Validate the skill or prompts are available in your target agent workspace.",
        "Capture any manual follow-up steps the agent could not complete."
      ]
    },
    "downloadPageUrl": "https://openagent3.xyz/downloads/afrexai-observability-engine",
    "agentPageUrl": "https://openagent3.xyz/skills/afrexai-observability-engine/agent",
    "manifestUrl": "https://openagent3.xyz/skills/afrexai-observability-engine/agent.json",
    "briefUrl": "https://openagent3.xyz/skills/afrexai-observability-engine/agent.md"
  },
  "agentAssist": {
    "summary": "Hand the extracted package to your coding agent with a concrete install brief instead of figuring it out manually.",
    "steps": [
      "Download the package from Yavira.",
      "Extract it into a folder your agent can access.",
      "Paste one of the prompts below and point your agent at the extracted folder."
    ],
    "prompts": [
      {
        "label": "New install",
        "body": "I downloaded a skill package from Yavira. Read SKILL.md from the extracted folder and install it by following the included instructions. Then review README.md for any prerequisites, environment setup, or post-install checks. Tell me what you changed and call out any manual steps you could not complete."
      },
      {
        "label": "Upgrade existing",
        "body": "I downloaded an updated skill package from Yavira. Read SKILL.md from the extracted folder, compare it with my current installation, and upgrade it while preserving any custom configuration unless the package docs explicitly say otherwise. Then review README.md for any prerequisites, environment setup, or post-install checks. Summarize what changed and any follow-up checks I should run."
      }
    ]
  },
  "documentation": {
    "source": "clawhub",
    "primaryDoc": "SKILL.md",
    "sections": [
      {
        "title": "Observability & Reliability Engineering",
        "body": "Complete system for building observable, reliable services — from structured logging to incident response to SLO-driven development."
      },
      {
        "title": "Quick Health Check (/16)",
        "body": "Score your current observability posture:\n\n| Signal | Healthy (2) | Weak (1) | Missing (0) |\n|---|---|---|---|\n| Structured logging | JSON logs with trace_id correlation | Logs exist but unstructured | Console.log / print statements |\n| Metrics collection | RED/USE metrics with dashboards | Some metrics, no dashboards | No metrics |\n| Distributed tracing | Full request path with sampling | Partial traces, key services only | No tracing |\n| Alerting | SLO-based alerts with runbooks | Threshold alerts, some runbooks | No alerts or all-noise |\n| Incident response | Defined process with roles + post-mortems | Ad-hoc response, some docs | \"Whoever notices fixes it\" |\n| SLOs defined | SLOs with error budgets tracked weekly | Informal availability targets | No reliability targets |\n| On-call rotation | Structured rotation with escalation | Informal \"call someone\" | No on-call |\n| Cost management | Observability budget tracked monthly | Some awareness of costs | No idea what you spend |\n\n12-16: Production-grade. Focus on optimization.\n8-11: Foundation exists. Fill the gaps systematically.\n4-7: Significant risk. Prioritize alerting + incident response.\n0-3: Flying blind. Start with Phase 1 immediately."
      },
      {
        "title": "Log Architecture",
        "body": "Application → Structured JSON → Log Router → Storage → Query Engine\n                                    ↓\n                              Alert Pipeline"
      },
      {
        "title": "Required Fields (Every Log Line)",
        "body": "| Field | Type | Purpose | Example |\n|---|---|---|---|\n| timestamp | ISO-8601 UTC | When | 2026-02-22T18:30:00.123Z |\n| level | enum | Severity | info, warn, error, fatal |\n| service | string | Which service | payment-api |\n| version | string | Which deploy | v2.3.1 |\n| environment | string | Which env | production |\n| message | string | What happened | Payment processed successfully |\n| trace_id | string | Request correlation | abc123def456 |\n| span_id | string | Operation within trace | span_789 |\n| duration_ms | number | How long | 142 |"
      },
      {
        "title": "Contextual Fields (Add Per Domain)",
        "body": "# HTTP request context\nhttp:\n  method: POST\n  path: /api/v1/orders\n  status: 201\n  client_ip: 203.0.113.42  # Anonymize in logs if needed\n  user_agent: \"Mozilla/5.0...\"\n  request_id: \"req_abc123\"\n\n# Business context\nbusiness:\n  user_id: \"usr_456\"\n  tenant_id: \"tenant_789\"\n  order_id: \"ord_012\"\n  action: \"checkout\"\n  amount_cents: 4999\n  currency: \"USD\"\n\n# Error context\nerror:\n  type: \"PaymentDeclinedError\"\n  message: \"Card declined: insufficient funds\"\n  code: \"CARD_DECLINED\"\n  stack: \"...\" # Only in non-production or DEBUG level\n  retry_count: 2\n  retryable: true"
      },
      {
        "title": "Log Level Decision Tree",
        "body": "Is the process about to crash?\n  → FATAL (exit after logging)\n\nDid an operation fail that needs human attention?\n  → ERROR (page someone or create ticket)\n\nDid something unexpected happen but we recovered?\n  → WARN (review in daily triage)\n\nIs this a normal business event worth recording?\n  → INFO (audit trail, business metrics)\n\nIs this useful for debugging but noisy in production?\n  → DEBUG (off in prod, on in staging)\n\nIs this only useful when stepping through code?\n  → TRACE (never in production)"
      },
      {
        "title": "Log Level Rules",
        "body": "ERROR means action required — if no one needs to act on it, it's WARN\nINFO is for business events — not internal implementation details\nNo logging inside tight loops — aggregate and log summary\nLog at boundaries — API entry/exit, queue consume/publish, DB calls\nNever log secrets — API keys, tokens, passwords, PII (see scrubbing below)"
      },
      {
        "title": "PII & Secret Scrubbing",
        "body": "scrub_patterns:\n  # Always redact\n  - field_patterns: [\"password\", \"secret\", \"token\", \"api_key\", \"authorization\"]\n    action: replace_with_redacted\n  \n  # Hash for correlation without exposure\n  - field_patterns: [\"email\", \"phone\", \"ssn\", \"national_id\"]\n    action: sha256_hash\n  \n  # Mask partially\n  - field_patterns: [\"credit_card\", \"card_number\"]\n    action: mask_last_4  # \"****-****-****-1234\"\n  \n  # IP anonymization\n  - field_patterns: [\"client_ip\", \"ip_address\"]\n    action: zero_last_octet  # 203.0.113.0"
      },
      {
        "title": "Logger Setup (By Language)",
        "body": "Node.js (Pino):\n\nimport pino from 'pino';\nimport { AsyncLocalStorage } from 'node:async_hooks';\n\nconst als = new AsyncLocalStorage<Record<string, string>>();\n\nconst logger = pino({\n  level: process.env.LOG_LEVEL || 'info',\n  formatters: {\n    level: (label) => ({ level: label }),\n  },\n  mixin: () => als.getStore() ?? {},\n  redact: ['req.headers.authorization', '*.password', '*.token'],\n  timestamp: pino.stdTimeFunctions.isoTime,\n});\n\n// Middleware: inject context\napp.use((req, res, next) => {\n  const ctx = {\n    trace_id: req.headers['x-trace-id'] || crypto.randomUUID(),\n    request_id: crypto.randomUUID(),\n    service: 'payment-api',\n    version: process.env.APP_VERSION,\n  };\n  als.run(ctx, () => next());\n});\n\nPython (structlog):\n\nimport structlog\nstructlog.configure(\n    processors=[\n        structlog.contextvars.merge_contextvars,\n        structlog.processors.add_log_level,\n        structlog.processors.TimeStamper(fmt=\"iso\", utc=True),\n        structlog.processors.JSONRenderer(),\n    ],\n)\nlog = structlog.get_logger()\n# Bind context per-request:\nstructlog.contextvars.bind_contextvars(trace_id=trace_id, user_id=user_id)\n\nGo (zerolog):\n\nlog := zerolog.New(os.Stdout).With().\n    Timestamp().\n    Str(\"service\", \"payment-api\").\n    Str(\"version\", version).\n    Logger()\n// Per-request:\nreqLog := log.With().Str(\"trace_id\", traceID).Logger()"
      },
      {
        "title": "Log Storage Decision",
        "body": "| Volume | Solution | Retention | Cost |\n|---|---|---|---|\n| <10 GB/day | Loki + Grafana | 30 days hot, 90 days cold | Low |\n| 10-100 GB/day | Elasticsearch / OpenSearch | 14 days hot, 90 days S3 | Medium |\n| 100+ GB/day | ClickHouse or Datadog | 7 days hot, 30 days archive | High |\n| Budget-constrained | Loki + S3 backend | 90 days all cold | Very low |"
      },
      {
        "title": "10 Logging Anti-Patterns",
        "body": "| # | Anti-Pattern | Fix |\n|---|---|---|\n| 1 | log.error(err) with no context | Always include: what operation, what input, what state |\n| 2 | Logging request/response bodies | Log only in DEBUG; redact sensitive fields |\n| 3 | String concatenation in log messages | Use structured fields: log.info(\"processed\", { order_id, amount }) |\n| 4 | Catch-and-log-and-rethrow | Log at the boundary where you handle it, not every layer |\n| 5 | Different log formats per service | Standardize schema across all services |\n| 6 | No log rotation / retention policy | Set max size + TTL; archive to cold storage |\n| 7 | Logging inside hot paths | Aggregate: log summary every N items or every interval |\n| 8 | Missing correlation IDs | Propagate trace_id from first entry point through all services |\n| 9 | Boolean log levels (verbose: true) | Use standard levels with configurable minimum |\n| 10 | Logging PII in plain text | Implement scrubbing at the logger level |"
      },
      {
        "title": "The RED Method (Request-Driven Services)",
        "body": "For every service endpoint, track:\n\n| Metric | What | Prometheus Example |\n|---|---|---|\n| Rate | Requests per second | http_requests_total{method, path, status} |\n| Errors | Failed requests per second | http_requests_total{status=~\"5..\"} / total |\n| Duration | Latency distribution | http_request_duration_seconds{method, path} (histogram) |"
      },
      {
        "title": "The USE Method (Infrastructure Resources)",
        "body": "For every resource (CPU, memory, disk, network):\n\n| Metric | What | Example |\n|---|---|---|\n| Utilization | % resource busy | CPU usage 78% |\n| Saturation | Queue depth / backpressure | 12 requests queued |\n| Errors | Resource errors | 3 disk I/O errors |"
      },
      {
        "title": "Golden Signals (Google SRE)",
        "body": "| Signal | Meaning | Source |\n|---|---|---|\n| Latency | Time to serve requests | RED Duration |\n| Traffic | Demand on the system | RED Rate |\n| Errors | Rate of failed requests | RED Errors |\n| Saturation | How \"full\" the service is | USE Saturation |"
      },
      {
        "title": "Metric Types & When to Use Each",
        "body": "| Type | Use Case | Example |\n|---|---|---|\n| Counter | Things that only go up | Total requests, errors, bytes sent |\n| Gauge | Current value that goes up/down | Active connections, queue depth, temperature |\n| Histogram | Distribution of values | Request latency, response size |\n| Summary | Pre-calculated percentiles | Client-side latency (when you need exact percentiles) |\n\nRule: Use histograms over summaries in most cases — they're aggregatable across instances."
      },
      {
        "title": "Naming Conventions",
        "body": "# Pattern: <namespace>_<subsystem>_<name>_<unit>\nhttp_server_request_duration_seconds\nhttp_server_requests_total\ndb_pool_connections_active\nqueue_messages_pending\ncache_hit_ratio\n\n# Rules:\n# 1. Use snake_case\n# 2. Include unit suffix (_seconds, _bytes, _total)\n# 3. _total suffix for counters\n# 4. Don't include label names in metric name\n# 5. Use base units (seconds not milliseconds, bytes not kilobytes)"
      },
      {
        "title": "Label Design Rules",
        "body": "| Rule | Why | Example |\n|---|---|---|\n| Keep cardinality <100 per label | High cardinality kills performance | status=\"200\" not status=\"200 OK\" |\n| No user IDs as labels | Unbounded cardinality | Use log correlation instead |\n| No request paths with IDs | /api/users/123 creates millions of series | Normalize: /api/users/:id |\n| Max 5-7 labels per metric | Each combo = a time series | {method, path, status, service} |"
      },
      {
        "title": "Instrumentation Checklist",
        "body": "application_metrics:\n  # HTTP layer\n  - http_request_duration_seconds: histogram {method, path, status}\n  - http_request_size_bytes: histogram {method, path}\n  - http_response_size_bytes: histogram {method, path}\n  - http_requests_in_flight: gauge\n  \n  # Business logic\n  - orders_processed_total: counter {status, payment_method}\n  - order_value_dollars: histogram {payment_method}\n  - user_signups_total: counter {source}\n  \n  # Dependencies\n  - db_query_duration_seconds: histogram {query_type, table}\n  - db_connections_active: gauge {pool}\n  - db_connections_idle: gauge {pool}\n  - cache_requests_total: counter {result: hit|miss}\n  - external_api_duration_seconds: histogram {service, endpoint}\n  - external_api_errors_total: counter {service, error_type}\n  \n  # Queue / async\n  - queue_messages_published_total: counter {queue}\n  - queue_messages_consumed_total: counter {queue, status}\n  - queue_processing_duration_seconds: histogram {queue}\n  - queue_depth: gauge {queue}\n  - queue_consumer_lag: gauge {queue, consumer_group}\n\ninfrastructure_metrics:\n  # Node exporter / cAdvisor provides these automatically\n  - cpu_usage_percent: gauge {instance}\n  - memory_usage_bytes: gauge {instance}\n  - disk_usage_bytes: gauge {instance, mount}\n  - disk_io_seconds: counter {instance, device}\n  - network_bytes: counter {instance, direction}\n  - container_cpu_usage: gauge {pod, container}\n  - container_memory_usage: gauge {pod, container}"
      },
      {
        "title": "Stack Recommendations",
        "body": "| Component | Options | Recommendation |\n|---|---|---|\n| Collection | Prometheus, OTEL Collector, Datadog Agent | Prometheus (free) or OTEL Collector (vendor-neutral) |\n| Storage | Prometheus, Thanos, Mimir, VictoriaMetrics | VictoriaMetrics (best cost/perf) or Mimir (Grafana ecosystem) |\n| Visualization | Grafana, Datadog, New Relic | Grafana (free, extensible) |\n| Alerting | Alertmanager, Grafana Alerting, PagerDuty | Alertmanager + PagerDuty routing |"
      },
      {
        "title": "Trace Architecture",
        "body": "Client Request\n  → API Gateway (root span)\n    → Auth Service (child span)\n    → Order Service (child span)\n      → Database Query (child span)\n      → Payment Service (child span)\n        → Stripe API (child span)\n    → Notification Service (child span)\n      → Email Provider (child span)"
      },
      {
        "title": "OpenTelemetry Setup",
        "body": "Auto-instrumentation (Node.js):\n\n// tracing.ts — import BEFORE anything else\nimport { NodeSDK } from '@opentelemetry/sdk-node';\nimport { getNodeAutoInstrumentations } from '@opentelemetry/auto-instrumentations-node';\nimport { OTLPTraceExporter } from '@opentelemetry/exporter-trace-otlp-http';\n\nconst sdk = new NodeSDK({\n  traceExporter: new OTLPTraceExporter({\n    url: process.env.OTEL_EXPORTER_OTLP_ENDPOINT || 'http://localhost:4318/v1/traces',\n  }),\n  instrumentations: [getNodeAutoInstrumentations({\n    '@opentelemetry/instrumentation-http': { ignoreIncomingPaths: ['/health', '/ready'] },\n    '@opentelemetry/instrumentation-express': { enabled: true },\n  })],\n  serviceName: process.env.OTEL_SERVICE_NAME || 'payment-api',\n});\nsdk.start();\n\nCustom spans for business logic:\n\nimport { trace, SpanStatusCode } from '@opentelemetry/api';\n\nconst tracer = trace.getTracer('payment-service');\n\nasync function processPayment(order: Order) {\n  return tracer.startActiveSpan('process-payment', async (span) => {\n    span.setAttributes({\n      'order.id': order.id,\n      'order.amount_cents': order.amountCents,\n      'payment.method': order.paymentMethod,\n    });\n    try {\n      const result = await chargeCard(order);\n      span.setAttributes({ 'payment.status': result.status });\n      return result;\n    } catch (err) {\n      span.setStatus({ code: SpanStatusCode.ERROR, message: err.message });\n      span.recordException(err);\n      throw err;\n    } finally {\n      span.end();\n    }\n  });\n}"
      },
      {
        "title": "Sampling Strategies",
        "body": "| Strategy | When | Config |\n|---|---|---|\n| Always On | Dev/staging, low traffic (<100 rps) | ratio: 1.0 |\n| Probabilistic | Moderate traffic (100-1000 rps) | ratio: 0.1 (10%) |\n| Rate-limited | High traffic (>1000 rps) | max_traces_per_second: 100 |\n| Tail-based | Want all errors + slow requests | Collector-side: keep if error OR duration > p99 |\n| Parent-based | Respect upstream decisions | If parent sampled, child sampled |\n\nRecommendation: Start with parent-based + probabilistic (10%). Add tail-based at the collector to capture all errors."
      },
      {
        "title": "Context Propagation",
        "body": "| Header | Standard | Format |\n|---|---|---|\n| traceparent | W3C Trace Context | 00-{trace_id}-{span_id}-{flags} |\n| tracestate | W3C Trace Context | Vendor-specific key-value pairs |\n| b3 | Zipkin B3 | {trace_id}-{span_id}-{sampled} |\n\nRule: Use W3C Trace Context (traceparent) as primary. Support B3 for legacy Zipkin systems."
      },
      {
        "title": "Trace Storage",
        "body": "| Volume | Solution | Retention |\n|---|---|---|\n| <50 GB/day | Jaeger + Elasticsearch | 7 days |\n| 50-500 GB/day | Tempo + S3 | 14 days |\n| 500+ GB/day | Tempo + S3 with aggressive sampling | 7 days |\n| Budget-constrained | Jaeger + Badger (local disk) | 3 days |"
      },
      {
        "title": "SLI Selection by Service Type",
        "body": "| Service Type | Primary SLI | Secondary SLI | Measurement |\n|---|---|---|---|\n| API / Web | Availability + Latency | Error rate | Server-side + synthetic |\n| Data pipeline | Freshness + Correctness | Throughput | Pipeline timestamps + checksums |\n| Storage | Durability + Availability | Latency | Checksums + uptime monitoring |\n| Streaming | Throughput + Latency | Message loss rate | Consumer lag + e2e latency |\n| Batch jobs | Success rate + Freshness | Duration | Job scheduler metrics |"
      },
      {
        "title": "SLO Definition Template",
        "body": "slo:\n  name: \"Payment API Availability\"\n  service: payment-api\n  owner: payments-team\n  \n  sli:\n    type: availability\n    definition: \"Proportion of non-5xx responses\"\n    measurement: |\n      sum(rate(http_requests_total{service=\"payment-api\",status!~\"5..\"}[5m]))\n      /\n      sum(rate(http_requests_total{service=\"payment-api\"}[5m]))\n    \n  target: 99.95%  # 21.9 min downtime/month\n  window: rolling_30d\n  \n  error_budget:\n    total_minutes: 21.9  # per 30 days\n    burn_rate_alerts:\n      - severity: critical\n        burn_rate: 14.4x  # Budget consumed in 2 hours\n        short_window: 5m\n        long_window: 1h\n      - severity: warning\n        burn_rate: 6x    # Budget consumed in 5 days\n        short_window: 30m\n        long_window: 6h\n      - severity: ticket\n        burn_rate: 1x    # Budget consumed in 30 days\n        short_window: 6h\n        long_window: 3d\n  \n  consequences:\n    budget_remaining_above_50pct: \"Normal development velocity\"\n    budget_remaining_20_to_50pct: \"Prioritize reliability work\"\n    budget_remaining_below_20pct: \"Feature freeze; reliability only\"\n    budget_exhausted: \"All hands on reliability until budget recovers\""
      },
      {
        "title": "Common SLO Targets",
        "body": "| Service Tier | Availability | p50 Latency | p99 Latency | Monthly Downtime |\n|---|---|---|---|---|\n| Tier 0 (payments, auth) | 99.99% | <100ms | <500ms | 4.3 min |\n| Tier 1 (core API) | 99.95% | <200ms | <1s | 21.9 min |\n| Tier 2 (non-critical) | 99.9% | <500ms | <2s | 43.8 min |\n| Tier 3 (internal tools) | 99.5% | <1s | <5s | 3.6 hours |\n| Batch / pipeline | 99% (success rate) | N/A | N/A | N/A |"
      },
      {
        "title": "Error Budget Tracking",
        "body": "# Weekly error budget review template\nerror_budget_review:\n  week: \"2026-W08\"\n  service: payment-api\n  slo_target: 99.95%\n  \n  budget:\n    total_minutes_this_period: 21.9\n    consumed_minutes: 8.2\n    remaining_minutes: 13.7\n    remaining_percent: 62.6%\n    \n  incidents_consuming_budget:\n    - date: \"2026-02-18\"\n      duration_minutes: 5.1\n      cause: \"Database connection pool exhaustion\"\n      preventable: true\n      action: \"Increase pool size + add saturation alert\"\n    - date: \"2026-02-20\"\n      duration_minutes: 3.1\n      cause: \"Upstream payment provider timeout\"\n      preventable: false\n      action: \"Add circuit breaker with fallback\"\n  \n  velocity_decision: \"Normal — 62.6% budget remaining\"\n  reliability_work_this_week:\n    - \"Add connection pool saturation alert\"\n    - \"Implement circuit breaker for payment provider\""
      },
      {
        "title": "Alert Quality Principles",
        "body": "Every alert must be actionable — if no one needs to act, it's not an alert\nEvery alert needs a runbook — linked directly in the alert annotation\nSymptom-based over cause-based — alert on \"users can't checkout\" not \"CPU high\"\nMulti-window burn rate — not static thresholds (see SLO alerts above)\nAlert on absence, not just presence — \"no orders in 15 min\" catches silent failures"
      },
      {
        "title": "Alert Severity Levels",
        "body": "| Severity | Response Time | Channel | Who | Example |\n|---|---|---|---|---|\n| P0 — Critical | <5 min | Page (PagerDuty/Opsgenie) | On-call engineer | Payment system down |\n| P1 — High | <30 min | Page during business hours, Slack 24/7 | On-call | Error rate >5% for 10 min |\n| P2 — Medium | <4 hours | Slack channel | Team | p99 latency degraded 2x |\n| P3 — Low | Next business day | Ticket auto-created | Team backlog | Disk usage >80% |\n| Info | N/A | Dashboard only | No one | Deploy completed |"
      },
      {
        "title": "Alerting Anti-Patterns",
        "body": "| Anti-Pattern | Problem | Fix |\n|---|---|---|\n| Static CPU/memory thresholds | Noisy, not user-impacting | Use SLO-based burn rate alerts |\n| Alert per instance | 50 instances = 50 alerts for same issue | Aggregate: alert on service-level error rate |\n| No deduplication | Same alert fires 100 times | Group by service + alert name; set repeat interval |\n| Missing runbook | Engineer gets paged, doesn't know what to do | Every alert links to a runbook |\n| Threshold too sensitive | Fires on brief spikes | Use for: 5m to require sustained condition |\n| Too many P0s | Alert fatigue → ignoring real incidents | Audit monthly; demote or remove noisy alerts |"
      },
      {
        "title": "Alert Template (Prometheus Alertmanager)",
        "body": "groups:\n  - name: payment-api-slo\n    rules:\n      - alert: PaymentAPIHighErrorRate\n        expr: |\n          (\n            sum(rate(http_requests_total{service=\"payment-api\",status=~\"5..\"}[5m]))\n            /\n            sum(rate(http_requests_total{service=\"payment-api\"}[5m]))\n          ) > 0.01\n        for: 5m\n        labels:\n          severity: critical\n          service: payment-api\n          team: payments\n        annotations:\n          summary: \"Payment API error rate {{ $value | humanizePercentage }} (>1%)\"\n          description: \"5xx error rate has exceeded 1% for 5 minutes\"\n          runbook: \"https://wiki.internal/runbooks/payment-api-errors\"\n          dashboard: \"https://grafana.internal/d/payment-api\"\n          \n      - alert: PaymentAPINoTraffic\n        expr: |\n          sum(rate(http_requests_total{service=\"payment-api\"}[15m])) == 0\n        for: 5m\n        labels:\n          severity: critical\n          service: payment-api\n        annotations:\n          summary: \"Payment API receiving zero traffic for 5 minutes\"\n          runbook: \"https://wiki.internal/runbooks/payment-api-no-traffic\"\n\n      - alert: PaymentAPILatencyHigh\n        expr: |\n          histogram_quantile(0.99, \n            sum(rate(http_request_duration_seconds_bucket{service=\"payment-api\"}[5m])) by (le)\n          ) > 2\n        for: 10m\n        labels:\n          severity: warning\n        annotations:\n          summary: \"Payment API p99 latency {{ $value }}s (>2s for 10min)\"\n          runbook: \"https://wiki.internal/runbooks/payment-api-latency\""
      },
      {
        "title": "Runbook Template",
        "body": "# Runbook: PaymentAPIHighErrorRate\n\n## What This Alert Means\nThe payment API is returning >1% 5xx errors over a 5-minute window.\nUsers are likely failing to complete checkouts.\n\n## Impact\n- Users cannot process payments\n- Revenue loss: ~$X per minute (based on average traffic)\n- SLO: Payment API availability (target: 99.95%)\n\n## Immediate Actions\n1. Check the error dashboard: [link]\n2. Check recent deploys: `kubectl rollout history deployment/payment-api`\n3. Check upstream dependencies:\n   - Database: [dashboard link]\n   - Stripe API: [status page]\n   - Redis cache: [dashboard link]\n4. Check application logs:\n\nkubectl logs -l app=payment-api --since=10m | jq 'select(.level==\"error\")'\n\n## Common Causes & Fixes\n| Cause | Diagnosis | Fix |\n|-------|-----------|-----|\n| Bad deploy | Errors started at deploy time | `kubectl rollout undo deployment/payment-api` |\n| DB connection exhaustion | `db_connections_active` at max | Restart pods (rolling) + increase pool size |\n| Stripe outage | Stripe status page red | Enable fallback payment processor |\n| Memory leak | Memory climbing, OOMKilled events | Rolling restart + investigate |\n\n## Escalation\n- If unresolved after 15 min: page payment team lead\n- If revenue impact >$10K: page VP Engineering\n- If Stripe outage: communicate to support team for customer messaging\n\n## Resolution\n- Confirm error rate <0.1% for 10 min\n- Post in #incidents: root cause + duration + impact\n- Schedule post-mortem if downtime >5 min"
      },
      {
        "title": "Dashboard Hierarchy",
        "body": "L1: Executive / Business Dashboard (non-technical stakeholders)\n  ↓\nL2: Service Overview Dashboard (on-call, quick triage)\n  ↓\nL3: Service Deep-Dive Dashboard (debugging specific service)\n  ↓\nL4: Infrastructure Dashboard (resource-level details)"
      },
      {
        "title": "L1: Business Dashboard",
        "body": "panels:\n  - title: \"Revenue per Minute\"\n    type: stat\n    query: \"sum(rate(orders_total{status='completed'}[5m])) * avg(order_value_dollars)\"\n  - title: \"Active Users (5min)\"\n    type: stat\n    query: \"count(count by (user_id) (http_requests_total{...}[5m]))\"\n  - title: \"Checkout Success Rate\"\n    type: gauge\n    query: \"sum(rate(checkout_total{status='success'}[1h])) / sum(rate(checkout_total[1h]))\"\n    thresholds: [95, 98, 99.5]\n  - title: \"Error Budget Remaining\"\n    type: gauge\n    query: \"1 - (error_budget_consumed / error_budget_total)\""
      },
      {
        "title": "L2: Service Overview Dashboard",
        "body": "Every service gets one of these with identical layout:\n\nrow_1_traffic:\n  - \"Request Rate (rps)\" — timeseries, by status code\n  - \"Error Rate (%)\" — timeseries, threshold line at SLO\n  - \"Active Requests\" — gauge\n\nrow_2_latency:\n  - \"Latency Distribution\" — heatmap\n  - \"p50 / p95 / p99\" — timeseries, threshold lines\n  - \"Latency by Endpoint\" — table, sorted by p99\n\nrow_3_dependencies:\n  - \"Downstream Latency\" — timeseries per dependency\n  - \"Downstream Error Rate\" — timeseries per dependency\n  - \"Database Query Duration\" — timeseries by query type\n\nrow_4_resources:\n  - \"CPU Usage\" — timeseries per pod\n  - \"Memory Usage\" — timeseries per pod\n  - \"Pod Restarts\" — stat\n\nrow_5_business:\n  - \"Business Metric 1\" — service-specific\n  - \"Business Metric 2\" — service-specific"
      },
      {
        "title": "Dashboard Rules",
        "body": "Time range default: last 1 hour — most debugging happens in recent time\nVariable selectors at top: environment, service, instance\nConsistent color coding: green=good, yellow=degraded, red=bad across all dashboards\nLink alerts to dashboards — every alert annotation includes dashboard URL\nNo more than 15 panels per dashboard — split into L3 if needed\nInclude \"as of\" timestamp — so screenshots in incidents are unambiguous\nDashboard as code — store Grafana JSON in git, provision via API"
      },
      {
        "title": "Incident Severity Classification",
        "body": "| Severity | Criteria | Response | Communication |\n|---|---|---|---|\n| SEV-1 | Service down, data loss risk, security breach | All hands, war room | Status page update every 15 min |\n| SEV-2 | Degraded service, SLO at risk, partial outage | On-call + backup | Status page update every 30 min |\n| SEV-3 | Minor degradation, workaround exists | On-call during hours | Internal Slack update |\n| SEV-4 | Cosmetic, low impact | Next sprint | None |"
      },
      {
        "title": "Incident Roles",
        "body": "| Role | Responsibility | Who |\n|---|---|---|\n| Incident Commander (IC) | Owns the incident. Coordinates. Makes decisions. | On-call lead |\n| Technical Lead | Diagnoses and fixes. Communicates technical status to IC. | Senior engineer |\n| Communications Lead | Updates status page, Slack, stakeholders. | Product/support |\n| Scribe | Documents timeline, actions, decisions in real-time. | Anyone available |"
      },
      {
        "title": "Incident Response Workflow",
        "body": "1. DETECT\n   - Alert fires → on-call paged\n   - Customer report → support escalates\n   - Internal discovery → engineer reports\n   \n2. TRIAGE (first 5 minutes)\n   - Confirm the issue is real (not false alert)\n   - Classify severity (SEV-1 through SEV-4)\n   - Open incident channel: #inc-YYYY-MM-DD-short-description\n   - Assign roles (IC, Tech Lead, Comms)\n   \n3. MITIGATE (next 5-30 minutes)\n   - Goal: STOP THE BLEEDING, not find root cause\n   - Options (try in order):\n     a. Rollback last deploy\n     b. Scale up / restart pods\n     c. Toggle feature flag off\n     d. Redirect traffic / enable fallback\n     e. Manual data fix\n   - Document every action with timestamp\n   \n4. STABILIZE\n   - Confirm mitigation is working (metrics back to normal)\n   - Monitor for 15-30 min for recurrence\n   - Update status page: \"Monitoring fix\"\n   \n5. RESOLVE\n   - Confirm all metrics healthy for 30+ min\n   - Update status page: \"Resolved\"\n   - Schedule post-mortem (within 48 hours for SEV-1/2)\n   - Send internal summary to stakeholders"
      },
      {
        "title": "Incident Channel Template",
        "body": "📋 Incident: Payment API 5xx Errors\n🔴 Severity: SEV-2\n🕐 Started: 2026-02-22 14:23 UTC\n👤 IC: @alice\n🔧 Tech Lead: @bob\n📢 Comms: @charlie\n\nStatus: MITIGATING\nImpact: ~5% of checkout requests failing\nCustomer-facing: Yes\n\nTimeline:\n14:23 — Alert fired: PaymentAPIHighErrorRate\n14:25 — IC assigned: @alice, confirmed real via dashboard\n14:28 — Tech Lead: error logs show connection pool exhaustion post-deploy\n14:31 — Rolled back deployment v2.3.1 → v2.3.0\n14:35 — Error rate dropping, monitoring\n14:50 — Error rate <0.1%, marking resolved"
      },
      {
        "title": "Blameless Post-Mortem Template",
        "body": "post_mortem:\n  title: \"Payment API Connection Pool Exhaustion\"\n  date: \"2026-02-22\"\n  severity: SEV-2\n  duration: 27 minutes (14:23 — 14:50 UTC)\n  authors: [\"@alice\", \"@bob\"]\n  reviewers: [\"@engineering-leads\"]\n  status: action_items_in_progress\n  \n  summary: |\n    A deployment at 14:15 introduced a connection leak in the payment API.\n    Connection pool was exhausted by 14:23, causing 5xx errors for ~5% of\n    checkout requests. Rolled back at 14:31; recovered by 14:50.\n  \n  impact:\n    user_impact: \"~340 users saw checkout failures over 27 minutes\"\n    revenue_impact: \"$2,100 estimated (based on average order value × failed checkouts)\"\n    slo_impact: \"Consumed 5.1 min of 21.9 min monthly error budget (23%)\"\n    data_impact: \"No data loss. 12 orders failed; users could retry successfully.\"\n  \n  timeline:\n    - time: \"14:15\"\n      event: \"Deploy v2.3.1 rolled out (3/3 pods updated)\"\n    - time: \"14:23\"\n      event: \"PaymentAPIHighErrorRate alert fired\"\n    - time: \"14:25\"\n      event: \"IC assigned, confirmed via dashboard\"\n    - time: \"14:28\"\n      event: \"Root cause identified: new ORM query not releasing connections\"\n    - time: \"14:31\"\n      event: \"Rollback initiated: v2.3.1 → v2.3.0\"\n    - time: \"14:35\"\n      event: \"Error rate declining\"\n    - time: \"14:50\"\n      event: \"Resolved: error rate <0.1% sustained\"\n  \n  root_cause: |\n    The v2.3.1 deploy introduced a new database query in the order validation\n    path. The query used a raw connection instead of the pool's managed client,\n    so connections were acquired but never released. 
Under load, the pool\n    exhausted within 8 minutes.\n  \n  contributing_factors:\n    - \"No integration test for connection pool behavior under load\"\n    - \"Connection pool saturation metric existed but had no alert\"\n    - \"Code review didn't catch raw connection usage\"\n  \n  what_went_well:\n    - \"Alert fired within 8 minutes of deploy\"\n    - \"IC assigned in 2 minutes\"\n    - \"Root cause identified in 3 minutes (clear in logs)\"\n    - \"Rollback executed cleanly\"\n  \n  what_went_wrong:\n    - \"8-minute detection gap after deploy\"\n    - \"No canary deployment to catch before full rollout\"\n    - \"Connection pool saturation had no alert\"\n  \n  action_items:\n    - action: \"Add connection pool saturation alert (>80% for 2 min)\"\n      owner: \"@bob\"\n      priority: P1\n      due: \"2026-02-25\"\n      status: in_progress\n      ticket: \"ENG-1234\"\n    - action: \"Enable canary deployments for payment-api\"\n      owner: \"@alice\"\n      priority: P1\n      due: \"2026-03-01\"\n      ticket: \"ENG-1235\"\n    - action: \"Add linting rule: no raw DB connections in application code\"\n      owner: \"@charlie\"\n      priority: P2\n      due: \"2026-03-07\"\n      ticket: \"ENG-1236\"\n    - action: \"Load test payment-api connection pool in staging\"\n      owner: \"@bob\"\n      priority: P2\n      due: \"2026-03-07\"\n      ticket: \"ENG-1237\"\n  \n  lessons_learned:\n    - \"Resource saturation metrics need alerts, not just dashboards\"\n    - \"Canary deployments are mandatory for Tier 0 services\"\n    - \"ORM abstractions don't guarantee connection safety — review raw queries\""
      },
      {
        "title": "Post-Mortem Meeting Agenda (60 minutes)",
        "body": "1. (5 min) Context setting — IC reads the summary\n2. (15 min) Timeline walkthrough — what happened, when, by whom\n3. (15 min) Root cause deep-dive — 5 Whys exercise\n4. (5 min) What went well — celebrate good response\n5. (15 min) Action items — assign owners, priorities, due dates\n6. (5 min) Wrap-up — review date for action item check-in"
      },
      {
        "title": "5 Whys Exercise",
        "body": "Problem: 5xx errors in payment API\n\nWhy 1: Database connections were exhausted\nWhy 2: A new query acquired connections without releasing them\nWhy 3: The query used a raw connection instead of the pool manager\nWhy 4: The ORM's raw query API doesn't auto-release (by design)\nWhy 5: We don't have a linting rule or code review checklist item for this\n\nRoot cause: Missing guard against raw connection usage in application code\nSystemic fix: Linting rule + connection pool saturation alerting"
      },
      {
        "title": "On-Call Structure",
        "body": "on_call:\n  rotation: weekly\n  handoff_day: Monday 10:00 UTC\n  \n  primary:\n    response_time: 5 minutes (SEV-1/2), 30 minutes (SEV-3)\n    escalation_after: 15 minutes no-ack\n    \n  secondary:\n    response_time: 15 minutes (SEV-1), 1 hour (SEV-2/3)\n    escalation_after: 30 minutes no-ack\n    \n  manager_escalation:\n    trigger: SEV-1 unresolved after 30 minutes\n    \n  handoff_checklist:\n    - Review open incidents and active alerts\n    - Check error budget status for all services\n    - Read post-mortems from previous week\n    - Verify PagerDuty schedule and contact info\n    - Test alert routing (send test page)"
      },
      {
        "title": "On-Call Health Metrics",
        "body": "MetricHealthyNeeds AttentionUnhealthyPages per week<55-15>15After-hours pages per week<22-5>5False positive rate<10%10-30%>30%Mean time to acknowledge<5 min5-15 min>15 minMean time to resolve<30 min30-120 min>120 minToil ratio (manual vs automated)<30%30-60%>60%"
      },
      {
        "title": "Weekly On-Call Review Template",
        "body": "on_call_review:\n  week: \"2026-W08\"\n  engineer: \"@bob\"\n  \n  incidents:\n    total: 7\n    sev_1: 0\n    sev_2: 1\n    sev_3: 4\n    false_positives: 2\n    after_hours: 3\n    \n  time_spent:\n    incident_response: \"4.5 hours\"\n    toil_automation: \"2 hours\"\n    runbook_updates: \"1 hour\"\n    \n  improvements_made:\n    - \"Silenced noisy disk alert on dev servers\"\n    - \"Added auto-remediation for pod restart threshold\"\n    \n  improvements_needed:\n    - \"Cache expiry alert fires every Tuesday at 03:00 — needs investigation\"\n    - \"Payment retry logic needs circuit breaker (caused 3 alerts)\"\n    \n  handoff_notes: |\n    Watch payment-api p99 latency — it's been creeping up since Wednesday.\n    Stripe changed their sandbox endpoints; staging may throw errors."
      },
      {
        "title": "Chaos Principles",
        "body": "Start with a hypothesis: \"If X fails, the system should Y\"\nRun in production (start small — one instance, one AZ)\nMinimize blast radius with automatic rollback\nBuild confidence incrementally: staging → canary → production"
      },
      {
        "title": "Chaos Experiment Template",
        "body": "chaos_experiment:\n  name: \"Payment DB failover\"\n  hypothesis: \"If the primary database becomes unavailable, traffic should\n    failover to the replica within 30 seconds with <1% error rate spike\"\n  \n  steady_state:\n    - metric: \"checkout_success_rate\"\n      expected: \">99.5%\"\n    - metric: \"db_query_duration_p99\"\n      expected: \"<200ms\"\n  \n  injection:\n    type: \"network_partition\"\n    target: \"payment-db-primary\"\n    duration: \"5 minutes\"\n    blast_radius: \"single AZ\"\n  \n  abort_conditions:\n    - \"checkout_success_rate < 95% for > 60 seconds\"\n    - \"revenue_per_minute drops > 50%\"\n    - \"any SEV-1 incident declared\"\n  \n  results:\n    failover_time: \"22 seconds\"\n    error_spike: \"0.3% for 25 seconds\"\n    hypothesis_confirmed: true\n    \n  follow_up_actions:\n    - \"Document failover behavior in runbook\"\n    - \"Add failover time as SLI (target: <30s)\""
      },
      {
        "title": "Chaos Engineering Maturity Levels",
        "body": "LevelWhat You TestTools1: ManualKill a pod, see what happenskubectl delete pod2: AutomatedScheduled pod kills, network delaysChaos Monkey, Litmus3: Game DaysMulti-failure scenarios with team exerciseCustom scripts + coordination4: ContinuousAutomated chaos in production with auto-rollbackGremlin, Chaos Mesh"
      },
      {
        "title": "Cost Drivers (Ranked)",
        "body": "#DriverTypical % of BillOptimization1Log volume40-60%Reduce verbosity, drop DEBUG, sample repetitive2Metric cardinality15-25%Drop unused metrics, limit labels3Trace volume10-20%Sampling, tail-based sampling4Retention10-15%Tiered storage (hot → warm → cold)5Query cost5-10%Optimize dashboard queries, set max scan limits"
      },
      {
        "title": "Cost Reduction Checklist",
        "body": "cost_optimization:\n  logs:\n    - action: \"Drop DEBUG/TRACE in production\"\n      savings: \"30-50% of log volume\"\n    - action: \"Sample health check logs (1:100)\"\n      savings: \"5-15% of log volume\"\n    - action: \"Deduplicate identical error bursts\"\n      savings: \"10-20% during incidents\"\n    - action: \"Move logs older than 7 days to S3/cold storage\"\n      savings: \"60-80% of storage cost\"\n    - action: \"Drop request/response body logging\"\n      savings: \"20-40% of log volume\"\n  \n  metrics:\n    - action: \"Audit unused metrics (no dashboard, no alert)\"\n      savings: \"10-30% of series\"\n    - action: \"Reduce histogram bucket count (default 11 → 8)\"\n      savings: \"~27% of histogram series\"\n    - action: \"Remove high-cardinality labels\"\n      savings: \"Variable — can be massive\"\n    - action: \"Increase scrape interval for non-critical metrics (15s → 60s)\"\n      savings: \"75% of data points for those metrics\"\n  \n  traces:\n    - action: \"Implement tail-based sampling\"\n      savings: \"80-95% of trace volume\"\n    - action: \"Drop internal health check traces\"\n      savings: \"5-20% of trace volume\"\n    - action: \"Reduce span attribute size (truncate long strings)\"\n      savings: \"10-30% of trace storage\"\n  \n  general:\n    - action: \"Review and right-size retention policies quarterly\"\n    - action: \"Set query timeouts and result limits on dashboards\"\n    - action: \"Use recording rules for expensive queries\""
      },
      {
        "title": "Monthly Cost Review Template",
        "body": "observability_cost_review:\n  month: \"February 2026\"\n  total_cost: \"$X,XXX\"\n  \n  breakdown:\n    logs: { volume: \"X TB\", cost: \"$X\", pct: \"X%\" }\n    metrics: { series: \"X million\", cost: \"$X\", pct: \"X%\" }\n    traces: { volume: \"X TB\", cost: \"$X\", pct: \"X%\" }\n    infrastructure: { instances: X, cost: \"$X\", pct: \"X%\" }\n  \n  cost_per:\n    request: \"$0.000X\"\n    service: \"$X average\"\n    engineer: \"$X per engineer\"\n  \n  optimizations_applied: []\n  optimizations_planned: []\n  budget_status: \"on_track | over_budget | under_budget\""
      },
      {
        "title": "Correlation: Connecting the Three Pillars",
        "body": "Every log line includes: trace_id, span_id\nEvery trace span includes: service, operation\nEvery metric includes: service label\n\nCorrelation paths:\n  Alert fires (metric) → Click → Dashboard (metric) → Filter by time window\n    → Trace search (same service + time) → Find failing trace\n    → Logs (filter by trace_id) → See exact error\n    \n  Support ticket (user report) → Find request_id in logs\n    → Extract trace_id → View full trace → Identify slow span\n    → Check span's service metrics → Confirm pattern"
      },
      {
        "title": "Synthetic Monitoring",
        "body": "synthetic_checks:\n  - name: \"Checkout flow\"\n    type: browser\n    frequency: 5m\n    locations: [us-east, eu-west, ap-southeast]\n    steps:\n      - navigate: \"https://app.example.com/products\"\n      - click: \"Add to Cart\"\n      - click: \"Checkout\"\n      - assert: \"Order confirmation page loads in <3s\"\n    alert_on: \"2 consecutive failures from same location\"\n    \n  - name: \"API health\"\n    type: api\n    frequency: 1m\n    endpoints:\n      - url: \"https://api.example.com/health\"\n        expected_status: 200\n        max_latency_ms: 500\n      - url: \"https://api.example.com/v1/products?limit=1\"\n        expected_status: 200\n        max_latency_ms: 1000"
      },
      {
        "title": "Feature Flag Observability",
        "body": "# Correlate feature flags with metrics\nfeature_flag_monitoring:\n  - flag: \"new_checkout_flow\"\n    metrics_to_compare:\n      - \"checkout_conversion_rate\" # by flag variant\n      - \"checkout_error_rate\"\n      - \"checkout_latency_p99\"\n    alerts:\n      - \"If error rate for new variant > 2x control, auto-disable flag\""
      },
      {
        "title": "Observability Maturity Model",
        "body": "DimensionLevel 1Level 2Level 3Level 4LoggingUnstructured logsStructured JSON, centralizedCorrelated with tracesAutomated log analysisMetricsBasic infra metricsRED/USE for servicesSLO-based with error budgetsPredictive (anomaly detection)TracingNo tracingKey services instrumentedFull distributed tracingTrace-driven testingAlertingStatic thresholdsMulti-signal alertsBurn-rate based on SLOsAuto-remediationIncident ResponseAd hocDefined process + rolesPost-mortems with action trackingChaos engineering in prodCulture\"Ops team handles it\"Shared ownership (you build it, you run it)SLO-driven development velocityReliability as a feature"
      },
      {
        "title": "Quality Scoring Rubric (0-100)",
        "body": "DimensionWeight0510Logging quality15%Unstructured, no correlationStructured JSON, missing fieldsFull schema, trace correlation, PII scrubbingMetrics coverage15%No metricsRED or USE, not bothRED + USE + business metrics + customTracing completeness10%No tracingKey servicesFull path, sampling strategy, tail-basedSLO maturity15%No reliability targetsInformal targetsSLOs with error budgets, burn-rate alerts, weekly reviewAlert quality15%Noisy/missingActionable, some runbooksSLO-based, full runbooks, low false positiveIncident response10%Ad hocDefined processFull process, roles, post-mortems, chaos engineeringDashboard design10%No dashboardsBasic panelsHierarchical L1-L4, consistent, linked to alertsCost efficiency10%Unknown costTrackedOptimized, reviewed monthly, within budget\n\n90-100: World-class. Teach others. 70-89: Production-ready. Fill specific gaps. 50-69: Functional but fragile. <50: Significant reliability risk."
      },
      {
        "title": "10 Observability Commandments",
        "body": "Structured or it didn't happen — unstructured logs are technical debt\nCorrelate everything — trace_id connects logs, traces, and metrics\nAlert on symptoms, not causes — users don't care about CPU, they care about latency\nEvery alert gets a runbook — no runbook = no alert\nSLOs drive velocity — error budgets decide when to ship vs stabilize\nDashboards have hierarchy — executives don't need pod CPU graphs\nBlameless post-mortems always — blame prevents learning\nCost is a feature — observability that bankrupts you isn't observability\nYou build it, you run it — the team that ships code owns its observability\nPractice failure — chaos engineering builds confidence"
      },
      {
        "title": "12 Natural Language Commands",
        "body": "CommandWhat It Does\"Audit our observability\"Run the /16 health check, score each dimension, prioritize gaps\"Design logging for [service]\"Generate structured log schema with context fields for the service\"Set up metrics for [service]\"Create RED + USE + business metric instrumentation plan\"Create SLOs for [service]\"Define SLIs, targets, error budgets, and burn-rate alert rules\"Design alerts for [service]\"Create alert rules with severity, thresholds, and runbook templates\"Build dashboard for [service]\"Design L2 service overview dashboard with panel specifications\"Write a runbook for [alert]\"Generate structured runbook with diagnosis steps and fixes\"Run post-mortem for [incident]\"Generate blameless post-mortem document with timeline and action items\"Set up on-call for [team]\"Design rotation, escalation policy, handoff checklist\"Plan chaos experiment for [scenario]\"Design experiment with hypothesis, injection, abort conditions\"Optimize observability costs\"Audit current spend, identify top savings, create reduction plan\"Design tracing for [system]\"Create OpenTelemetry instrumentation plan with sampling strategy"
      },
      {
        "title": "⚡ Level Up Your Observability",
        "body": "This skill gives you the methodology. For industry-specific implementation patterns:\n\nSaaS companies: AfrexAI SaaS Context Pack ($47) — includes SaaS-specific SLOs, multi-tenant monitoring, and usage-based billing observability\nFintech: AfrexAI Fintech Context Pack ($47) — compliance audit logging, transaction monitoring, fraud detection signals\nHealthcare: AfrexAI Healthcare Context Pack ($47) — HIPAA audit trails, PHI access logging, uptime requirements"
      },
      {
        "title": "🔗 More Free Skills by AfrexAI",
        "body": "afrexai-devops-engine — CI/CD, infrastructure, deployment strategies\nafrexai-api-architect — API design, security, versioning\nafrexai-database-engineering — Schema design, query optimization, migrations\nafrexai-code-reviewer — Code review methodology with SPEAR framework\nafrexai-prompt-engineering — System prompt design, testing, optimization\n\nBrowse all AfrexAI skills: clawhub.com | Full storefront"
      }
    ],
    "body": "Observability & Reliability Engineering\n\nComplete system for building observable, reliable services — from structured logging to incident response to SLO-driven development.\n\nQuick Health Check (/16)\n\nScore your current observability posture:\n\nSignal\tHealthy (2)\tWeak (1)\tMissing (0)\nStructured logging\tJSON logs with trace_id correlation\tLogs exist but unstructured\tConsole.log / print statements\nMetrics collection\tRED/USE metrics with dashboards\tSome metrics, no dashboards\tNo metrics\nDistributed tracing\tFull request path with sampling\tPartial traces, key services only\tNo tracing\nAlerting\tSLO-based alerts with runbooks\tThreshold alerts, some runbooks\tNo alerts or all-noise\nIncident response\tDefined process with roles + post-mortems\tAd-hoc response, some docs\t\"Whoever notices fixes it\"\nSLOs defined\tSLOs with error budgets tracked weekly\tInformal availability targets\tNo reliability targets\nOn-call rotation\tStructured rotation with escalation\tInformal \"call someone\"\tNo on-call\nCost management\tObservability budget tracked monthly\tSome awareness of costs\tNo idea what you spend\n\n12-16: Production-grade. Focus on optimization. 8-11: Foundation exists. Fill the gaps systematically. 4-7: Significant risk. Prioritize alerting + incident response. 0-3: Flying blind. 
Start with Phase 1 immediately.\n\nPhase 1: Structured Logging\nLog Architecture\nApplication → Structured JSON → Log Router → Storage → Query Engine\n                                    ↓\n                              Alert Pipeline\n\nRequired Fields (Every Log Line)\nField\tType\tPurpose\tExample\ntimestamp\tISO-8601 UTC\tWhen\t2026-02-22T18:30:00.123Z\nlevel\tenum\tSeverity\tinfo, warn, error, fatal\nservice\tstring\tWhich service\tpayment-api\nversion\tstring\tWhich deploy\tv2.3.1\nenvironment\tstring\tWhich env\tproduction\nmessage\tstring\tWhat happened\tPayment processed successfully\ntrace_id\tstring\tRequest correlation\tabc123def456\nspan_id\tstring\tOperation within trace\tspan_789\nduration_ms\tnumber\tHow long\t142\nContextual Fields (Add Per Domain)\n# HTTP request context\nhttp:\n  method: POST\n  path: /api/v1/orders\n  status: 201\n  client_ip: 203.0.113.42  # Anonymize in logs if needed\n  user_agent: \"Mozilla/5.0...\"\n  request_id: \"req_abc123\"\n\n# Business context\nbusiness:\n  user_id: \"usr_456\"\n  tenant_id: \"tenant_789\"\n  order_id: \"ord_012\"\n  action: \"checkout\"\n  amount_cents: 4999\n  currency: \"USD\"\n\n# Error context\nerror:\n  type: \"PaymentDeclinedError\"\n  message: \"Card declined: insufficient funds\"\n  code: \"CARD_DECLINED\"\n  stack: \"...\" # Only in non-production or DEBUG level\n  retry_count: 2\n  retryable: true\n\nLog Level Decision Tree\nIs the process about to crash?\n  → FATAL (exit after logging)\n\nDid an operation fail that needs human attention?\n  → ERROR (page someone or create ticket)\n\nDid something unexpected happen but we recovered?\n  → WARN (review in daily triage)\n\nIs this a normal business event worth recording?\n  → INFO (audit trail, business metrics)\n\nIs this useful for debugging but noisy in production?\n  → DEBUG (off in prod, on in staging)\n\nIs this only useful when stepping through code?\n  → TRACE (never in production)\n\nLog Level Rules\nERROR means action required — if 
no one needs to act on it, it's WARN\nINFO is for business events — not internal implementation details\nNo logging inside tight loops — aggregate and log summary\nLog at boundaries — API entry/exit, queue consume/publish, DB calls\nNever log secrets — API keys, tokens, passwords, PII (see scrubbing below)\nPII & Secret Scrubbing\nscrub_patterns:\n  # Always redact\n  - field_patterns: [\"password\", \"secret\", \"token\", \"api_key\", \"authorization\"]\n    action: replace_with_redacted\n  \n  # Hash for correlation without exposure\n  - field_patterns: [\"email\", \"phone\", \"ssn\", \"national_id\"]\n    action: sha256_hash\n  \n  # Mask partially\n  - field_patterns: [\"credit_card\", \"card_number\"]\n    action: mask_last_4  # \"****-****-****-1234\"\n  \n  # IP anonymization\n  - field_patterns: [\"client_ip\", \"ip_address\"]\n    action: zero_last_octet  # 203.0.113.0\n\nLogger Setup (By Language)\n\nNode.js (Pino):\n\nimport pino from 'pino';\nimport { AsyncLocalStorage } from 'node:async_hooks';\n\nconst als = new AsyncLocalStorage<Record<string, string>>();\n\nconst logger = pino({\n  level: process.env.LOG_LEVEL || 'info',\n  formatters: {\n    level: (label) => ({ level: label }),\n  },\n  mixin: () => als.getStore() ?? 
{},\n  redact: ['req.headers.authorization', '*.password', '*.token'],\n  timestamp: pino.stdTimeFunctions.isoTime,\n});\n\n// Middleware: inject context\napp.use((req, res, next) => {\n  const ctx = {\n    trace_id: req.headers['x-trace-id'] || crypto.randomUUID(),\n    request_id: crypto.randomUUID(),\n    service: 'payment-api',\n    version: process.env.APP_VERSION,\n  };\n  als.run(ctx, () => next());\n});\n\n\nPython (structlog):\n\nimport structlog\nstructlog.configure(\n    processors=[\n        structlog.contextvars.merge_contextvars,\n        structlog.processors.add_log_level,\n        structlog.processors.TimeStamper(fmt=\"iso\", utc=True),\n        structlog.processors.JSONRenderer(),\n    ],\n)\nlog = structlog.get_logger()\n# Bind context per-request:\nstructlog.contextvars.bind_contextvars(trace_id=trace_id, user_id=user_id)\n\n\nGo (zerolog):\n\nlog := zerolog.New(os.Stdout).With().\n    Timestamp().\n    Str(\"service\", \"payment-api\").\n    Str(\"version\", version).\n    Logger()\n// Per-request:\nreqLog := log.With().Str(\"trace_id\", traceID).Logger()\n\nLog Storage Decision\nVolume\tSolution\tRetention\tCost\n<10 GB/day\tLoki + Grafana\t30 days hot, 90 days cold\tLow\n10-100 GB/day\tElasticsearch / OpenSearch\t14 days hot, 90 days S3\tMedium\n100+ GB/day\tClickHouse or Datadog\t7 days hot, 30 days archive\tHigh\nBudget-constrained\tLoki + S3 backend\t90 days all cold\tVery low\n10 Logging Anti-Patterns\n#\tAnti-Pattern\tFix\n1\tlog.error(err) with no context\tAlways include: what operation, what input, what state\n2\tLogging request/response bodies\tLog only in DEBUG; redact sensitive fields\n3\tString concatenation in log messages\tUse structured fields: log.info(\"processed\", { order_id, amount })\n4\tCatch-and-log-and-rethrow\tLog at the boundary where you handle it, not every layer\n5\tDifferent log formats per service\tStandardize schema across all services\n6\tNo log rotation / retention policy\tSet max size + TTL; archive to cold 
storage\n7\tLogging inside hot paths\tAggregate: log summary every N items or every interval\n8\tMissing correlation IDs\tPropagate trace_id from first entry point through all services\n9\tBoolean log levels (verbose: true)\tUse standard levels with configurable minimum\n10\tLogging PII in plain text\tImplement scrubbing at the logger level\nPhase 2: Metrics Collection\nThe RED Method (Request-Driven Services)\n\nFor every service endpoint, track:\n\nMetric\tWhat\tPrometheus Example\nRate\tRequests per second\thttp_requests_total{method, path, status}\nErrors\tFailed requests per second\thttp_requests_total{status=~\"5..\"} / total\nDuration\tLatency distribution\thttp_request_duration_seconds{method, path} (histogram)\nThe USE Method (Infrastructure Resources)\n\nFor every resource (CPU, memory, disk, network):\n\nMetric\tWhat\tExample\nUtilization\t% resource busy\tCPU usage 78%\nSaturation\tQueue depth / backpressure\t12 requests queued\nErrors\tResource errors\t3 disk I/O errors\nGolden Signals (Google SRE)\nSignal\tMeaning\tSource\nLatency\tTime to serve requests\tRED Duration\nTraffic\tDemand on the system\tRED Rate\nErrors\tRate of failed requests\tRED Errors\nSaturation\tHow \"full\" the service is\tUSE Saturation\nMetric Types & When to Use Each\nType\tUse Case\tExample\nCounter\tThings that only go up\tTotal requests, errors, bytes sent\nGauge\tCurrent value that goes up/down\tActive connections, queue depth, temperature\nHistogram\tDistribution of values\tRequest latency, response size\nSummary\tPre-calculated percentiles\tClient-side latency (when you need exact percentiles)\n\nRule: Use histograms over summaries in most cases — they're aggregatable across instances.\n\nNaming Conventions\n# Pattern: <namespace>_<subsystem>_<name>_<unit>\nhttp_server_request_duration_seconds\nhttp_server_requests_total\ndb_pool_connections_active\nqueue_messages_pending\ncache_hit_ratio\n\n# Rules:\n# 1. Use snake_case\n# 2. 
Include unit suffix (_seconds, _bytes, _total)\n# 3. _total suffix for counters\n# 4. Don't include label names in metric name\n# 5. Use base units (seconds not milliseconds, bytes not kilobytes)\n\nLabel Design Rules\nRule\tWhy\tExample\nKeep cardinality <100 per label\tHigh cardinality kills performance\tstatus=\"200\" not status=\"200 OK\"\nNo user IDs as labels\tUnbounded cardinality\tUse log correlation instead\nNo request paths with IDs\t/api/users/123 creates millions of series\tNormalize: /api/users/:id\nMax 5-7 labels per metric\tEach combo = a time series\t{method, path, status, service}\nInstrumentation Checklist\napplication_metrics:\n  # HTTP layer\n  - http_request_duration_seconds: histogram {method, path, status}\n  - http_request_size_bytes: histogram {method, path}\n  - http_response_size_bytes: histogram {method, path}\n  - http_requests_in_flight: gauge\n  \n  # Business logic\n  - orders_processed_total: counter {status, payment_method}\n  - order_value_dollars: histogram {payment_method}\n  - user_signups_total: counter {source}\n  \n  # Dependencies\n  - db_query_duration_seconds: histogram {query_type, table}\n  - db_connections_active: gauge {pool}\n  - db_connections_idle: gauge {pool}\n  - cache_requests_total: counter {result: hit|miss}\n  - external_api_duration_seconds: histogram {service, endpoint}\n  - external_api_errors_total: counter {service, error_type}\n  \n  # Queue / async\n  - queue_messages_published_total: counter {queue}\n  - queue_messages_consumed_total: counter {queue, status}\n  - queue_processing_duration_seconds: histogram {queue}\n  - queue_depth: gauge {queue}\n  - queue_consumer_lag: gauge {queue, consumer_group}\n\ninfrastructure_metrics:\n  # Node exporter / cAdvisor provides these automatically\n  - cpu_usage_percent: gauge {instance}\n  - memory_usage_bytes: gauge {instance}\n  - disk_usage_bytes: gauge {instance, mount}\n  - disk_io_seconds: counter {instance, device}\n  - network_bytes: counter {instance, 
direction}\n  - container_cpu_usage: gauge {pod, container}\n  - container_memory_usage: gauge {pod, container}\n\nStack Recommendations\nComponent\tOptions\tRecommendation\nCollection\tPrometheus, OTEL Collector, Datadog Agent\tPrometheus (free) or OTEL Collector (vendor-neutral)\nStorage\tPrometheus, Thanos, Mimir, VictoriaMetrics\tVictoriaMetrics (best cost/perf) or Mimir (Grafana ecosystem)\nVisualization\tGrafana, Datadog, New Relic\tGrafana (free, extensible)\nAlerting\tAlertmanager, Grafana Alerting, PagerDuty\tAlertmanager + PagerDuty routing\nPhase 3: Distributed Tracing\nTrace Architecture\nClient Request\n  → API Gateway (root span)\n    → Auth Service (child span)\n    → Order Service (child span)\n      → Database Query (child span)\n      → Payment Service (child span)\n        → Stripe API (child span)\n    → Notification Service (child span)\n      → Email Provider (child span)\n\nOpenTelemetry Setup\n\nAuto-instrumentation (Node.js):\n\n// tracing.ts — import BEFORE anything else\nimport { NodeSDK } from '@opentelemetry/sdk-node';\nimport { getNodeAutoInstrumentations } from '@opentelemetry/auto-instrumentations-node';\nimport { OTLPTraceExporter } from '@opentelemetry/exporter-trace-otlp-http';\n\nconst sdk = new NodeSDK({\n  traceExporter: new OTLPTraceExporter({\n    url: process.env.OTEL_EXPORTER_OTLP_ENDPOINT || 'http://localhost:4318/v1/traces',\n  }),\n  instrumentations: [getNodeAutoInstrumentations({\n    '@opentelemetry/instrumentation-http': { ignoreIncomingPaths: ['/health', '/ready'] },\n    '@opentelemetry/instrumentation-express': { enabled: true },\n  })],\n  serviceName: process.env.OTEL_SERVICE_NAME || 'payment-api',\n});\nsdk.start();\n\n\nCustom spans for business logic:\n\nimport { trace, SpanStatusCode } from '@opentelemetry/api';\n\nconst tracer = trace.getTracer('payment-service');\n\nasync function processPayment(order: Order) {\n  return tracer.startActiveSpan('process-payment', async (span) => {\n    
span.setAttributes({\n      'order.id': order.id,\n      'order.amount_cents': order.amountCents,\n      'payment.method': order.paymentMethod,\n    });\n    try {\n      const result = await chargeCard(order);\n      span.setAttributes({ 'payment.status': result.status });\n      return result;\n    } catch (err) {\n      span.setStatus({ code: SpanStatusCode.ERROR, message: err.message });\n      span.recordException(err);\n      throw err;\n    } finally {\n      span.end();\n    }\n  });\n}\n\nSampling Strategies\nStrategy\tWhen\tConfig\nAlways On\tDev/staging, low traffic (<100 rps)\tratio: 1.0\nProbabilistic\tModerate traffic (100-1000 rps)\tratio: 0.1 (10%)\nRate-limited\tHigh traffic (>1000 rps)\tmax_traces_per_second: 100\nTail-based\tWant all errors + slow requests\tCollector-side: keep if error OR duration > p99\nParent-based\tRespect upstream decisions\tIf parent sampled, child sampled\n\nRecommendation: Start with parent-based + probabilistic (10%). Add tail-based at the collector to capture all errors.\n\nContext Propagation\nHeader\tStandard\tFormat\ntraceparent\tW3C Trace Context\t00-{trace_id}-{span_id}-{flags}\ntracestate\tW3C Trace Context\tVendor-specific key-value pairs\nb3\tZipkin B3\t{trace_id}-{span_id}-{sampled}\n\nRule: Use W3C Trace Context (traceparent) as primary. 
Support B3 for legacy Zipkin systems.\n\nTrace Storage\nVolume\tSolution\tRetention\n<50 GB/day\tJaeger + Elasticsearch\t7 days\n50-500 GB/day\tTempo + S3\t14 days\n500+ GB/day\tTempo + S3 with aggressive sampling\t7 days\nBudget-constrained\tJaeger + Badger (local disk)\t3 days\nPhase 4: SLOs, SLIs & Error Budgets\nSLI Selection by Service Type\nService Type\tPrimary SLI\tSecondary SLI\tMeasurement\nAPI / Web\tAvailability + Latency\tError rate\tServer-side + synthetic\nData pipeline\tFreshness + Correctness\tThroughput\tPipeline timestamps + checksums\nStorage\tDurability + Availability\tLatency\tChecksums + uptime monitoring\nStreaming\tThroughput + Latency\tMessage loss rate\tConsumer lag + e2e latency\nBatch jobs\tSuccess rate + Freshness\tDuration\tJob scheduler metrics\nSLO Definition Template\nslo:\n  name: \"Payment API Availability\"\n  service: payment-api\n  owner: payments-team\n  \n  sli:\n    type: availability\n    definition: \"Proportion of non-5xx responses\"\n    measurement: |\n      sum(rate(http_requests_total{service=\"payment-api\",status!~\"5..\"}[5m]))\n      /\n      sum(rate(http_requests_total{service=\"payment-api\"}[5m]))\n    \n  target: 99.95%  # 21.9 min downtime/month\n  window: rolling_30d\n  \n  error_budget:\n    total_minutes: 21.9  # per 30 days\n    burn_rate_alerts:\n      - severity: critical\n        burn_rate: 14.4x  # Budget consumed in 2 hours\n        short_window: 5m\n        long_window: 1h\n      - severity: warning\n        burn_rate: 6x    # Budget consumed in 5 days\n        short_window: 30m\n        long_window: 6h\n      - severity: ticket\n        burn_rate: 1x    # Budget consumed in 30 days\n        short_window: 6h\n        long_window: 3d\n  \n  consequences:\n    budget_remaining_above_50pct: \"Normal development velocity\"\n    budget_remaining_20_to_50pct: \"Prioritize reliability work\"\n    budget_remaining_below_20pct: \"Feature freeze; reliability only\"\n    budget_exhausted: \"All hands on 
reliability until budget recovers\"\n\nCommon SLO Targets\nService Tier\tAvailability\tp50 Latency\tp99 Latency\tMonthly Downtime\nTier 0 (payments, auth)\t99.99%\t<100ms\t<500ms\t4.3 min\nTier 1 (core API)\t99.95%\t<200ms\t<1s\t21.9 min\nTier 2 (non-critical)\t99.9%\t<500ms\t<2s\t43.8 min\nTier 3 (internal tools)\t99.5%\t<1s\t<5s\t3.6 hours\nBatch / pipeline\t99% (success rate)\tN/A\tN/A\tN/A\nError Budget Tracking\n# Weekly error budget review template\nerror_budget_review:\n  week: \"2026-W08\"\n  service: payment-api\n  slo_target: 99.95%\n  \n  budget:\n    total_minutes_this_period: 21.9\n    consumed_minutes: 8.2\n    remaining_minutes: 13.7\n    remaining_percent: 62.6%\n    \n  incidents_consuming_budget:\n    - date: \"2026-02-18\"\n      duration_minutes: 5.1\n      cause: \"Database connection pool exhaustion\"\n      preventable: true\n      action: \"Increase pool size + add saturation alert\"\n    - date: \"2026-02-20\"\n      duration_minutes: 3.1\n      cause: \"Upstream payment provider timeout\"\n      preventable: false\n      action: \"Add circuit breaker with fallback\"\n  \n  velocity_decision: \"Normal — 62.6% budget remaining\"\n  reliability_work_this_week:\n    - \"Add connection pool saturation alert\"\n    - \"Implement circuit breaker for payment provider\"\n\nPhase 5: Alert Design\nAlert Quality Principles\nEvery alert must be actionable — if no one needs to act, it's not an alert\nEvery alert needs a runbook — linked directly in the alert annotation\nSymptom-based over cause-based — alert on \"users can't checkout\" not \"CPU high\"\nMulti-window burn rate — not static thresholds (see SLO alerts above)\nAlert on absence, not just presence — \"no orders in 15 min\" catches silent failures\nAlert Severity Levels\nSeverity\tResponse Time\tChannel\tWho\tExample\nP0 — Critical\t<5 min\tPage (PagerDuty/Opsgenie)\tOn-call engineer\tPayment system down\nP1 — High\t<30 min\tPage during business hours, Slack 24/7\tOn-call\tError rate >5% for 
10 min\nP2 — Medium\t<4 hours\tSlack channel\tTeam\tp99 latency degraded 2x\nP3 — Low\tNext business day\tTicket auto-created\tTeam backlog\tDisk usage >80%\nInfo\tN/A\tDashboard only\tNo one\tDeploy completed\nAlerting Anti-Patterns\nAnti-Pattern\tProblem\tFix\nStatic CPU/memory thresholds\tNoisy, not user-impacting\tUse SLO-based burn rate alerts\nAlert per instance\t50 instances = 50 alerts for same issue\tAggregate: alert on service-level error rate\nNo deduplication\tSame alert fires 100 times\tGroup by service + alert name; set repeat interval\nMissing runbook\tEngineer gets paged, doesn't know what to do\tEvery alert links to a runbook\nThreshold too sensitive\tFires on brief spikes\tUse for: 5m to require sustained condition\nToo many P0s\tAlert fatigue → ignoring real incidents\tAudit monthly; demote or remove noisy alerts\nAlert Template (Prometheus Alertmanager)\ngroups:\n  - name: payment-api-slo\n    rules:\n      - alert: PaymentAPIHighErrorRate\n        expr: |\n          (\n            sum(rate(http_requests_total{service=\"payment-api\",status=~\"5..\"}[5m]))\n            /\n            sum(rate(http_requests_total{service=\"payment-api\"}[5m]))\n          ) > 0.01\n        for: 5m\n        labels:\n          severity: critical\n          service: payment-api\n          team: payments\n        annotations:\n          summary: \"Payment API error rate {{ $value | humanizePercentage }} (>1%)\"\n          description: \"5xx error rate has exceeded 1% for 5 minutes\"\n          runbook: \"https://wiki.internal/runbooks/payment-api-errors\"\n          dashboard: \"https://grafana.internal/d/payment-api\"\n          \n      - alert: PaymentAPINoTraffic\n        expr: |\n          sum(rate(http_requests_total{service=\"payment-api\"}[15m])) == 0\n        for: 5m\n        labels:\n          severity: critical\n          service: payment-api\n        annotations:\n          summary: \"Payment API receiving zero traffic for 5 minutes\"\n          runbook: 
\"https://wiki.internal/runbooks/payment-api-no-traffic\"\n\n      - alert: PaymentAPILatencyHigh\n        expr: |\n          histogram_quantile(0.99, \n            sum(rate(http_request_duration_seconds_bucket{service=\"payment-api\"}[5m])) by (le)\n          ) > 2\n        for: 10m\n        labels:\n          severity: warning\n        annotations:\n          summary: \"Payment API p99 latency {{ $value }}s (>2s for 10min)\"\n          runbook: \"https://wiki.internal/runbooks/payment-api-latency\"\n\nRunbook Template\n# Runbook: PaymentAPIHighErrorRate\n\n## What This Alert Means\nThe payment API is returning >1% 5xx errors over a 5-minute window.\nUsers are likely failing to complete checkouts.\n\n## Impact\n- Users cannot process payments\n- Revenue loss: ~$X per minute (based on average traffic)\n- SLO: Payment API availability (target: 99.95%)\n\n## Immediate Actions\n1. Check the error dashboard: [link]\n2. Check recent deploys: `kubectl rollout history deployment/payment-api`\n3. Check upstream dependencies:\n   - Database: [dashboard link]\n   - Stripe API: [status page]\n   - Redis cache: [dashboard link]\n4. 
Check application logs:\n\n\nkubectl logs -l app=payment-api --since=10m | jq 'select(.level==\"error\")'\n\n\n## Common Causes & Fixes\n| Cause | Diagnosis | Fix |\n|-------|-----------|-----|\n| Bad deploy | Errors started at deploy time | `kubectl rollout undo deployment/payment-api` |\n| DB connection exhaustion | `db_connections_active` at max | Restart pods (rolling) + increase pool size |\n| Stripe outage | Stripe status page red | Enable fallback payment processor |\n| Memory leak | Memory climbing, OOMKilled events | Rolling restart + investigate |\n\n## Escalation\n- If unresolved after 15 min: page payment team lead\n- If revenue impact >$10K: page VP Engineering\n- If Stripe outage: communicate to support team for customer messaging\n\n## Resolution\n- Confirm error rate <0.1% for 10 min\n- Post in #incidents: root cause + duration + impact\n- Schedule post-mortem if downtime >5 min\n\nPhase 6: Dashboard Architecture\nDashboard Hierarchy\nL1: Executive / Business Dashboard (non-technical stakeholders)\n  ↓\nL2: Service Overview Dashboard (on-call, quick triage)\n  ↓\nL3: Service Deep-Dive Dashboard (debugging specific service)\n  ↓\nL4: Infrastructure Dashboard (resource-level details)\n\nL1: Business Dashboard\npanels:\n  - title: \"Revenue per Minute\"\n    type: stat\n    query: \"sum(rate(orders_total{status='completed'}[5m])) * avg(order_value_dollars)\"\n  - title: \"Active Users (5min)\"\n    type: stat\n    query: \"count(count by (user_id) (http_requests_total{...}[5m]))\"\n  - title: \"Checkout Success Rate\"\n    type: gauge\n    query: \"sum(rate(checkout_total{status='success'}[1h])) / sum(rate(checkout_total[1h]))\"\n    thresholds: [95, 98, 99.5]\n  - title: \"Error Budget Remaining\"\n    type: gauge\n    query: \"1 - (error_budget_consumed / error_budget_total)\"\n\nL2: Service Overview Dashboard\n\nEvery service gets one of these with identical layout:\n\nrow_1_traffic:\n  - \"Request Rate (rps)\" — timeseries, by status code\n  - 
\"Error Rate (%)\" — timeseries, threshold line at SLO\n  - \"Active Requests\" — gauge\n\nrow_2_latency:\n  - \"Latency Distribution\" — heatmap\n  - \"p50 / p95 / p99\" — timeseries, threshold lines\n  - \"Latency by Endpoint\" — table, sorted by p99\n\nrow_3_dependencies:\n  - \"Downstream Latency\" — timeseries per dependency\n  - \"Downstream Error Rate\" — timeseries per dependency\n  - \"Database Query Duration\" — timeseries by query type\n\nrow_4_resources:\n  - \"CPU Usage\" — timeseries per pod\n  - \"Memory Usage\" — timeseries per pod\n  - \"Pod Restarts\" — stat\n\nrow_5_business:\n  - \"Business Metric 1\" — service-specific\n  - \"Business Metric 2\" — service-specific\n\nDashboard Rules\nTime range default: last 1 hour — most debugging happens in recent time\nVariable selectors at top: environment, service, instance\nConsistent color coding: green=good, yellow=degraded, red=bad across all dashboards\nLink alerts to dashboards — every alert annotation includes dashboard URL\nNo more than 15 panels per dashboard — split into L3 if needed\nInclude \"as of\" timestamp — so screenshots in incidents are unambiguous\nDashboard as code — store Grafana JSON in git, provision via API\nPhase 7: Incident Response\nIncident Severity Classification\nSeverity\tCriteria\tResponse\tCommunication\nSEV-1\tService down, data loss risk, security breach\tAll hands, war room\tStatus page update every 15 min\nSEV-2\tDegraded service, SLO at risk, partial outage\tOn-call + backup\tStatus page update every 30 min\nSEV-3\tMinor degradation, workaround exists\tOn-call during hours\tInternal Slack update\nSEV-4\tCosmetic, low impact\tNext sprint\tNone\nIncident Roles\nRole\tResponsibility\tWho\nIncident Commander (IC)\tOwns the incident. Coordinates. Makes decisions.\tOn-call lead\nTechnical Lead\tDiagnoses and fixes. 
Communicates technical status to IC.\tSenior engineer\nCommunications Lead\tUpdates status page, Slack, stakeholders.\tProduct/support\nScribe\tDocuments timeline, actions, decisions in real-time.\tAnyone available\nIncident Response Workflow\n1. DETECT\n   - Alert fires → on-call paged\n   - Customer report → support escalates\n   - Internal discovery → engineer reports\n   \n2. TRIAGE (first 5 minutes)\n   - Confirm the issue is real (not false alert)\n   - Classify severity (SEV-1 through SEV-4)\n   - Open incident channel: #inc-YYYY-MM-DD-short-description\n   - Assign roles (IC, Tech Lead, Comms)\n   \n3. MITIGATE (next 5-30 minutes)\n   - Goal: STOP THE BLEEDING, not find root cause\n   - Options (try in order):\n     a. Rollback last deploy\n     b. Scale up / restart pods\n     c. Toggle feature flag off\n     d. Redirect traffic / enable fallback\n     e. Manual data fix\n   - Document every action with timestamp\n   \n4. STABILIZE\n   - Confirm mitigation is working (metrics back to normal)\n   - Monitor for 15-30 min for recurrence\n   - Update status page: \"Monitoring fix\"\n   \n5. 
RESOLVE\n   - Confirm all metrics healthy for 30+ min\n   - Update status page: \"Resolved\"\n   - Schedule post-mortem (within 48 hours for SEV-1/2)\n   - Send internal summary to stakeholders\n\nIncident Channel Template\n📋 Incident: Payment API 5xx Errors\n🔴 Severity: SEV-2\n🕐 Started: 2026-02-22 14:23 UTC\n👤 IC: @alice\n🔧 Tech Lead: @bob\n📢 Comms: @charlie\n\nStatus: MITIGATING\nImpact: ~5% of checkout requests failing\nCustomer-facing: Yes\n\nTimeline:\n14:23 — Alert fired: PaymentAPIHighErrorRate\n14:25 — IC assigned: @alice, confirmed real via dashboard\n14:28 — Tech Lead: error logs show connection pool exhaustion post-deploy\n14:31 — Rolled back deployment v2.3.1 → v2.3.0\n14:35 — Error rate dropping, monitoring\n14:50 — Error rate <0.1%, marking resolved\n\nPhase 8: Post-Mortem Framework\nBlameless Post-Mortem Template\npost_mortem:\n  title: \"Payment API Connection Pool Exhaustion\"\n  date: \"2026-02-22\"\n  severity: SEV-2\n  duration: 27 minutes (14:23 — 14:50 UTC)\n  authors: [\"@alice\", \"@bob\"]\n  reviewers: [\"@engineering-leads\"]\n  status: action_items_in_progress\n  \n  summary: |\n    A deployment at 14:15 introduced a connection leak in the payment API.\n    Connection pool was exhausted by 14:23, causing 5xx errors for ~5% of\n    checkout requests. Rolled back at 14:31; recovered by 14:50.\n  \n  impact:\n    user_impact: \"~340 users saw checkout failures over 27 minutes\"\n    revenue_impact: \"$2,100 estimated (based on average order value × failed checkouts)\"\n    slo_impact: \"Consumed 5.1 min of 21.9 min monthly error budget (23%)\"\n    data_impact: \"No data loss. 
12 orders failed; users could retry successfully.\"\n  \n  timeline:\n    - time: \"14:15\"\n      event: \"Deploy v2.3.1 rolled out (3/3 pods updated)\"\n    - time: \"14:23\"\n      event: \"PaymentAPIHighErrorRate alert fired\"\n    - time: \"14:25\"\n      event: \"IC assigned, confirmed via dashboard\"\n    - time: \"14:28\"\n      event: \"Root cause identified: new ORM query not releasing connections\"\n    - time: \"14:31\"\n      event: \"Rollback initiated: v2.3.1 → v2.3.0\"\n    - time: \"14:35\"\n      event: \"Error rate declining\"\n    - time: \"14:50\"\n      event: \"Resolved: error rate <0.1% sustained\"\n  \n  root_cause: |\n    The v2.3.1 deploy introduced a new database query in the order validation\n    path. The query used a raw connection instead of the pool's managed client,\n    so connections were acquired but never released. Under load, the pool\n    exhausted within 8 minutes.\n  \n  contributing_factors:\n    - \"No integration test for connection pool behavior under load\"\n    - \"Connection pool saturation metric existed but had no alert\"\n    - \"Code review didn't catch raw connection usage\"\n  \n  what_went_well:\n    - \"Alert fired within 8 minutes of deploy\"\n    - \"IC assigned in 2 minutes\"\n    - \"Root cause identified in 3 minutes (clear in logs)\"\n    - \"Rollback executed cleanly\"\n  \n  what_went_wrong:\n    - \"8-minute detection gap after deploy\"\n    - \"No canary deployment to catch before full rollout\"\n    - \"Connection pool saturation had no alert\"\n  \n  action_items:\n    - action: \"Add connection pool saturation alert (>80% for 2 min)\"\n      owner: \"@bob\"\n      priority: P1\n      due: \"2026-02-25\"\n      status: in_progress\n      ticket: \"ENG-1234\"\n    - action: \"Enable canary deployments for payment-api\"\n      owner: \"@alice\"\n      priority: P1\n      due: \"2026-03-01\"\n      ticket: \"ENG-1235\"\n    - action: \"Add linting rule: no raw DB connections in application code\"\n   
   owner: \"@charlie\"\n      priority: P2\n      due: \"2026-03-07\"\n      ticket: \"ENG-1236\"\n    - action: \"Load test payment-api connection pool in staging\"\n      owner: \"@bob\"\n      priority: P2\n      due: \"2026-03-07\"\n      ticket: \"ENG-1237\"\n  \n  lessons_learned:\n    - \"Resource saturation metrics need alerts, not just dashboards\"\n    - \"Canary deployments are mandatory for Tier 0 services\"\n    - \"ORM abstractions don't guarantee connection safety — review raw queries\"\n\nPost-Mortem Meeting Agenda (60 minutes)\n1. (5 min) Context setting — IC reads the summary\n2. (15 min) Timeline walkthrough — what happened, when, by whom\n3. (15 min) Root cause deep-dive — 5 Whys exercise\n4. (5 min) What went well — celebrate good response\n5. (15 min) Action items — assign owners, priorities, due dates\n6. (5 min) Wrap-up — review date for action item check-in\n\n5 Whys Exercise\nProblem: 5xx errors in payment API\n\nWhy 1: Database connections were exhausted\nWhy 2: A new query acquired connections without releasing them\nWhy 3: The query used a raw connection instead of the pool manager\nWhy 4: The ORM's raw query API doesn't auto-release (by design)\nWhy 5: We don't have a linting rule or code review checklist item for this\n\nRoot cause: Missing guard against raw connection usage in application code\nSystemic fix: Linting rule + connection pool saturation alerting\n\nPhase 9: On-Call Operations\nOn-Call Structure\non_call:\n  rotation: weekly\n  handoff_day: Monday 10:00 UTC\n  \n  primary:\n    response_time: 5 minutes (SEV-1/2), 30 minutes (SEV-3)\n    escalation_after: 15 minutes no-ack\n    \n  secondary:\n    response_time: 15 minutes (SEV-1), 1 hour (SEV-2/3)\n    escalation_after: 30 minutes no-ack\n    \n  manager_escalation:\n    trigger: SEV-1 unresolved after 30 minutes\n    \n  handoff_checklist:\n    - Review open incidents and active alerts\n    - Check error budget status for all services\n    - Read post-mortems from 
previous week\n    - Verify PagerDuty schedule and contact info\n    - Test alert routing (send test page)\n\nOn-Call Health Metrics\nMetric\tHealthy\tNeeds Attention\tUnhealthy\nPages per week\t<5\t5-15\t>15\nAfter-hours pages per week\t<2\t2-5\t>5\nFalse positive rate\t<10%\t10-30%\t>30%\nMean time to acknowledge\t<5 min\t5-15 min\t>15 min\nMean time to resolve\t<30 min\t30-120 min\t>120 min\nToil ratio (manual vs automated)\t<30%\t30-60%\t>60%\nWeekly On-Call Review Template\non_call_review:\n  week: \"2026-W08\"\n  engineer: \"@bob\"\n  \n  incidents:\n    total: 7\n    sev_1: 0\n    sev_2: 1\n    sev_3: 4\n    false_positives: 2\n    after_hours: 3\n    \n  time_spent:\n    incident_response: \"4.5 hours\"\n    toil_automation: \"2 hours\"\n    runbook_updates: \"1 hour\"\n    \n  improvements_made:\n    - \"Silenced noisy disk alert on dev servers\"\n    - \"Added auto-remediation for pod restart threshold\"\n    \n  improvements_needed:\n    - \"Cache expiry alert fires every Tuesday at 03:00 — needs investigation\"\n    - \"Payment retry logic needs circuit breaker (caused 3 alerts)\"\n    \n  handoff_notes: |\n    Watch payment-api p99 latency — it's been creeping up since Wednesday.\n    Stripe changed their sandbox endpoints; staging may throw errors.\n\nPhase 10: Chaos Engineering & Reliability Testing\nChaos Principles\nStart with a hypothesis: \"If X fails, the system should Y\"\nRun in production (start small — one instance, one AZ)\nMinimize blast radius with automatic rollback\nBuild confidence incrementally: staging → canary → production\nChaos Experiment Template\nchaos_experiment:\n  name: \"Payment DB failover\"\n  hypothesis: \"If the primary database becomes unavailable, traffic should\n    failover to the replica within 30 seconds with <1% error rate spike\"\n  \n  steady_state:\n    - metric: \"checkout_success_rate\"\n      expected: \">99.5%\"\n    - metric: \"db_query_duration_p99\"\n      expected: \"<200ms\"\n  \n  injection:\n    
type: \"network_partition\"\n    target: \"payment-db-primary\"\n    duration: \"5 minutes\"\n    blast_radius: \"single AZ\"\n  \n  abort_conditions:\n    - \"checkout_success_rate < 95% for > 60 seconds\"\n    - \"revenue_per_minute drops > 50%\"\n    - \"any SEV-1 incident declared\"\n  \n  results:\n    failover_time: \"22 seconds\"\n    error_spike: \"0.3% for 25 seconds\"\n    hypothesis_confirmed: true\n    \n  follow_up_actions:\n    - \"Document failover behavior in runbook\"\n    - \"Add failover time as SLI (target: <30s)\"\n\nChaos Engineering Maturity Levels\nLevel\tWhat You Test\tTools\n1: Manual\tKill a pod, see what happens\tkubectl delete pod\n2: Automated\tScheduled pod kills, network delays\tChaos Monkey, Litmus\n3: Game Days\tMulti-failure scenarios with team exercise\tCustom scripts + coordination\n4: Continuous\tAutomated chaos in production with auto-rollback\tGremlin, Chaos Mesh\nPhase 11: Observability Cost Optimization\nCost Drivers (Ranked)\n#\tDriver\tTypical % of Bill\tOptimization\n1\tLog volume\t40-60%\tReduce verbosity, drop DEBUG, sample repetitive\n2\tMetric cardinality\t15-25%\tDrop unused metrics, limit labels\n3\tTrace volume\t10-20%\tSampling, tail-based sampling\n4\tRetention\t10-15%\tTiered storage (hot → warm → cold)\n5\tQuery cost\t5-10%\tOptimize dashboard queries, set max scan limits\nCost Reduction Checklist\ncost_optimization:\n  logs:\n    - action: \"Drop DEBUG/TRACE in production\"\n      savings: \"30-50% of log volume\"\n    - action: \"Sample health check logs (1:100)\"\n      savings: \"5-15% of log volume\"\n    - action: \"Deduplicate identical error bursts\"\n      savings: \"10-20% during incidents\"\n    - action: \"Move logs older than 7 days to S3/cold storage\"\n      savings: \"60-80% of storage cost\"\n    - action: \"Drop request/response body logging\"\n      savings: \"20-40% of log volume\"\n  \n  metrics:\n    - action: \"Audit unused metrics (no dashboard, no alert)\"\n      savings: \"10-30% of 
series\"\n    - action: \"Reduce histogram bucket count (default 11 → 8)\"\n      savings: \"~27% of histogram series\"\n    - action: \"Remove high-cardinality labels\"\n      savings: \"Variable — can be massive\"\n    - action: \"Increase scrape interval for non-critical metrics (15s → 60s)\"\n      savings: \"75% of data points for those metrics\"\n  \n  traces:\n    - action: \"Implement tail-based sampling\"\n      savings: \"80-95% of trace volume\"\n    - action: \"Drop internal health check traces\"\n      savings: \"5-20% of trace volume\"\n    - action: \"Reduce span attribute size (truncate long strings)\"\n      savings: \"10-30% of trace storage\"\n  \n  general:\n    - action: \"Review and right-size retention policies quarterly\"\n    - action: \"Set query timeouts and result limits on dashboards\"\n    - action: \"Use recording rules for expensive queries\"\n\nMonthly Cost Review Template\nobservability_cost_review:\n  month: \"February 2026\"\n  total_cost: \"$X,XXX\"\n  \n  breakdown:\n    logs: { volume: \"X TB\", cost: \"$X\", pct: \"X%\" }\n    metrics: { series: \"X million\", cost: \"$X\", pct: \"X%\" }\n    traces: { volume: \"X TB\", cost: \"$X\", pct: \"X%\" }\n    infrastructure: { instances: X, cost: \"$X\", pct: \"X%\" }\n  \n  cost_per:\n    request: \"$0.000X\"\n    service: \"$X average\"\n    engineer: \"$X per engineer\"\n  \n  optimizations_applied: []\n  optimizations_planned: []\n  budget_status: \"on_track | over_budget | under_budget\"\n\nPhase 12: Advanced Patterns\nCorrelation: Connecting the Three Pillars\nEvery log line includes: trace_id, span_id\nEvery trace span includes: service, operation\nEvery metric includes: service label\n\nCorrelation paths:\n  Alert fires (metric) → Click → Dashboard (metric) → Filter by time window\n    → Trace search (same service + time) → Find failing trace\n    → Logs (filter by trace_id) → See exact error\n    \n  Support ticket (user report) → Find request_id in logs\n    → Extract 
trace_id → View full trace → Identify slow span\n    → Check span's service metrics → Confirm pattern\n\nSynthetic Monitoring\nsynthetic_checks:\n  - name: \"Checkout flow\"\n    type: browser\n    frequency: 5m\n    locations: [us-east, eu-west, ap-southeast]\n    steps:\n      - navigate: \"https://app.example.com/products\"\n      - click: \"Add to Cart\"\n      - click: \"Checkout\"\n      - assert: \"Order confirmation page loads in <3s\"\n    alert_on: \"2 consecutive failures from same location\"\n    \n  - name: \"API health\"\n    type: api\n    frequency: 1m\n    endpoints:\n      - url: \"https://api.example.com/health\"\n        expected_status: 200\n        max_latency_ms: 500\n      - url: \"https://api.example.com/v1/products?limit=1\"\n        expected_status: 200\n        max_latency_ms: 1000\n\nFeature Flag Observability\n# Correlate feature flags with metrics\nfeature_flag_monitoring:\n  - flag: \"new_checkout_flow\"\n    metrics_to_compare:\n      - \"checkout_conversion_rate\" # by flag variant\n      - \"checkout_error_rate\"\n      - \"checkout_latency_p99\"\n    alerts:\n      - \"If error rate for new variant > 2x control, auto-disable flag\"\n\nObservability Maturity Model\nDimension\tLevel 1\tLevel 2\tLevel 3\tLevel 4\nLogging\tUnstructured logs\tStructured JSON, centralized\tCorrelated with traces\tAutomated log analysis\nMetrics\tBasic infra metrics\tRED/USE for services\tSLO-based with error budgets\tPredictive (anomaly detection)\nTracing\tNo tracing\tKey services instrumented\tFull distributed tracing\tTrace-driven testing\nAlerting\tStatic thresholds\tMulti-signal alerts\tBurn-rate based on SLOs\tAuto-remediation\nIncident Response\tAd hoc\tDefined process + roles\tPost-mortems with action tracking\tChaos engineering in prod\nCulture\t\"Ops team handles it\"\tShared ownership (you build it, you run it)\tSLO-driven development velocity\tReliability as a feature\nQuality Scoring Rubric (0-100)\nDimension\tWeight\t0\t5\t10\nLogging 
quality\t15%\tUnstructured, no correlation\tStructured JSON, missing fields\tFull schema, trace correlation, PII scrubbing\nMetrics coverage\t15%\tNo metrics\tRED or USE, not both\tRED + USE + business metrics + custom\nTracing completeness\t10%\tNo tracing\tKey services\tFull path, sampling strategy, tail-based\nSLO maturity\t15%\tNo reliability targets\tInformal targets\tSLOs with error budgets, burn-rate alerts, weekly review\nAlert quality\t15%\tNoisy/missing\tActionable, some runbooks\tSLO-based, full runbooks, low false positive\nIncident response\t10%\tAd hoc\tDefined process\tFull process, roles, post-mortems, chaos engineering\nDashboard design\t10%\tNo dashboards\tBasic panels\tHierarchical L1-L4, consistent, linked to alerts\nCost efficiency\t10%\tUnknown cost\tTracked\tOptimized, reviewed monthly, within budget\n\n90-100: World-class. Teach others. 70-89: Production-ready. Fill specific gaps. 50-69: Functional but fragile. <50: Significant reliability risk.\n\n10 Observability Commandments\nStructured or it didn't happen — unstructured logs are technical debt\nCorrelate everything — trace_id connects logs, traces, and metrics\nAlert on symptoms, not causes — users don't care about CPU, they care about latency\nEvery alert gets a runbook — no runbook = no alert\nSLOs drive velocity — error budgets decide when to ship vs stabilize\nDashboards have hierarchy — executives don't need pod CPU graphs\nBlameless post-mortems always — blame prevents learning\nCost is a feature — observability that bankrupts you isn't observability\nYou build it, you run it — the team that ships code owns its observability\nPractice failure — chaos engineering builds confidence\n12 Natural Language Commands\nCommand\tWhat It Does\n\"Audit our observability\"\tRun the /16 health check, score each dimension, prioritize gaps\n\"Design logging for [service]\"\tGenerate structured log schema with context fields for the service\n\"Set up metrics for [service]\"\tCreate RED + USE + 
business metric instrumentation plan\n\"Create SLOs for [service]\"\tDefine SLIs, targets, error budgets, and burn-rate alert rules\n\"Design alerts for [service]\"\tCreate alert rules with severity, thresholds, and runbook templates\n\"Build dashboard for [service]\"\tDesign L2 service overview dashboard with panel specifications\n\"Write a runbook for [alert]\"\tGenerate structured runbook with diagnosis steps and fixes\n\"Run post-mortem for [incident]\"\tGenerate blameless post-mortem document with timeline and action items\n\"Set up on-call for [team]\"\tDesign rotation, escalation policy, handoff checklist\n\"Plan chaos experiment for [scenario]\"\tDesign experiment with hypothesis, injection, abort conditions\n\"Optimize observability costs\"\tAudit current spend, identify top savings, create reduction plan\n\"Design tracing for [system]\"\tCreate OpenTelemetry instrumentation plan with sampling strategy\n⚡ Level Up Your Observability\n\nThis skill gives you the methodology. For industry-specific implementation patterns:\n\nSaaS companies: AfrexAI SaaS Context Pack ($47) — includes SaaS-specific SLOs, multi-tenant monitoring, and usage-based billing observability\nFintech: AfrexAI Fintech Context Pack ($47) — compliance audit logging, transaction monitoring, fraud detection signals\nHealthcare: AfrexAI Healthcare Context Pack ($47) — HIPAA audit trails, PHI access logging, uptime requirements\n🔗 More Free Skills by AfrexAI\nafrexai-devops-engine — CI/CD, infrastructure, deployment strategies\nafrexai-api-architect — API design, security, versioning\nafrexai-database-engineering — Schema design, query optimization, migrations\nafrexai-code-reviewer — Code review methodology with SPEAR framework\nafrexai-prompt-engineering — System prompt design, testing, optimization\n\nBrowse all AfrexAI skills: clawhub.com | Full storefront"
  },
  "trust": {
    "sourceLabel": "tencent",
    "provenanceUrl": "https://clawhub.ai/1kalin/afrexai-observability-engine",
    "publisherUrl": "https://clawhub.ai/1kalin/afrexai-observability-engine",
    "owner": "1kalin",
    "version": "1.0.0",
    "license": null,
    "verificationStatus": "Indexed source record"
  },
  "links": {
    "detailUrl": "https://openagent3.xyz/skills/afrexai-observability-engine",
    "downloadUrl": "https://openagent3.xyz/downloads/afrexai-observability-engine",
    "agentUrl": "https://openagent3.xyz/skills/afrexai-observability-engine/agent",
    "manifestUrl": "https://openagent3.xyz/skills/afrexai-observability-engine/agent.json",
    "briefUrl": "https://openagent3.xyz/skills/afrexai-observability-engine/agent.md"
  }
}