{
  "schemaVersion": "1.0",
  "item": {
    "slug": "logging-observability",
    "name": "Logging Observability",
    "source": "tencent",
    "type": "skill",
    "category": "AI 智能",
    "sourceUrl": "https://clawhub.ai/wpank/logging-observability",
    "canonicalUrl": "https://clawhub.ai/wpank/logging-observability",
    "targetPlatform": "OpenClaw"
  },
  "install": {
    "downloadMode": "redirect",
    "downloadUrl": "/downloads/logging-observability",
    "sourceDownloadUrl": "https://wry-manatee-359.convex.site/api/v1/download?slug=logging-observability",
    "sourcePlatform": "tencent",
    "targetPlatform": "OpenClaw",
    "installMethod": "Manual import",
    "extraction": "Extract archive",
    "prerequisites": [
      "OpenClaw"
    ],
    "packageFormat": "ZIP package",
    "includedAssets": [
      "README.md",
      "SKILL.md"
    ],
    "primaryDoc": "SKILL.md",
    "quickSetup": [
      "Download the package from Yavira.",
      "Extract the archive and review SKILL.md first.",
      "Import or place the package into your OpenClaw setup."
    ],
    "agentAssist": {
      "summary": "Hand the extracted package to your coding agent with a concrete install brief instead of figuring it out manually.",
      "steps": [
        "Download the package from Yavira.",
        "Extract it into a folder your agent can access.",
        "Paste one of the prompts below and point your agent at the extracted folder."
      ],
      "prompts": [
        {
          "label": "New install",
          "body": "I downloaded a skill package from Yavira. Read SKILL.md from the extracted folder and install it by following the included instructions. Then review README.md for any prerequisites, environment setup, or post-install checks. Tell me what you changed and call out any manual steps you could not complete."
        },
        {
          "label": "Upgrade existing",
          "body": "I downloaded an updated skill package from Yavira. Read SKILL.md from the extracted folder, compare it with my current installation, and upgrade it while preserving any custom configuration unless the package docs explicitly say otherwise. Then review README.md for any prerequisites, environment setup, or post-install checks. Summarize what changed and any follow-up checks I should run."
        }
      ]
    },
    "sourceHealth": {
      "source": "tencent",
      "status": "healthy",
      "reason": "direct_download_ok",
      "recommendedAction": "download",
      "checkedAt": "2026-04-30T16:55:25.780Z",
      "expiresAt": "2026-05-07T16:55:25.780Z",
      "httpStatus": 200,
      "finalUrl": "https://wry-manatee-359.convex.site/api/v1/download?slug=logging-observability",
      "contentType": "application/zip",
      "probeMethod": "head",
      "details": {
        "probeUrl": "https://wry-manatee-359.convex.site/api/v1/download?slug=logging-observability",
        "contentDisposition": "attachment; filename=\"logging-observability-0.1.0.zip\"",
        "redirectLocation": null,
        "bodySnippet": null
      },
      "scope": "source",
      "summary": "Source download looks usable.",
      "detail": "Yavira can redirect you to the upstream package for this source.",
      "primaryActionLabel": "Download for OpenClaw",
      "primaryActionHref": "/downloads/logging-observability"
    },
    "validation": {
      "installChecklist": [
        "Use the Yavira download entry.",
        "Review SKILL.md after the package is downloaded.",
        "Confirm the extracted package contains the expected setup assets."
      ],
      "postInstallChecks": [
        "Confirm the extracted package includes the expected docs or setup files.",
        "Validate the skill or prompts are available in your target agent workspace.",
        "Capture any manual follow-up steps the agent could not complete."
      ]
    },
    "downloadPageUrl": "https://openagent3.xyz/downloads/logging-observability",
    "agentPageUrl": "https://openagent3.xyz/skills/logging-observability/agent",
    "manifestUrl": "https://openagent3.xyz/skills/logging-observability/agent.json",
    "briefUrl": "https://openagent3.xyz/skills/logging-observability/agent.md"
  },
  "agentAssist": {
    "summary": "Hand the extracted package to your coding agent with a concrete install brief instead of figuring it out manually.",
    "steps": [
      "Download the package from Yavira.",
      "Extract it into a folder your agent can access.",
      "Paste one of the prompts below and point your agent at the extracted folder."
    ],
    "prompts": [
      {
        "label": "New install",
        "body": "I downloaded a skill package from Yavira. Read SKILL.md from the extracted folder and install it by following the included instructions. Then review README.md for any prerequisites, environment setup, or post-install checks. Tell me what you changed and call out any manual steps you could not complete."
      },
      {
        "label": "Upgrade existing",
        "body": "I downloaded an updated skill package from Yavira. Read SKILL.md from the extracted folder, compare it with my current installation, and upgrade it while preserving any custom configuration unless the package docs explicitly say otherwise. Then review README.md for any prerequisites, environment setup, or post-install checks. Summarize what changed and any follow-up checks I should run."
      }
    ]
  },
  "documentation": {
    "source": "clawhub",
    "primaryDoc": "SKILL.md",
    "sections": [
      {
        "title": "Logging & Observability",
        "body": "Patterns for building observable systems across the three pillars: logs, metrics, and traces."
      },
      {
        "title": "Three Pillars",
        "body": "PillarPurposeQuestion It AnswersExampleLogsWhat happenedWhy did this request fail?{\"level\":\"error\",\"msg\":\"payment declined\",\"user_id\":\"u_82\"}MetricsHow much / how fastIs latency increasing?http_request_duration_seconds{route=\"/api/orders\"} 0.342TracesRequest flowWhere is the bottleneck?Span: api-gateway → auth → order-service → db\n\nEach pillar is strongest when correlated. Embed trace_id in every log line to jump from a log entry to the full distributed trace."
      },
      {
        "title": "Structured Logging",
        "body": "Always emit logs as structured JSON — never free-text strings."
      },
      {
        "title": "Required Fields",
        "body": "FieldPurposeRequiredtimestampISO-8601 with millisecondsYeslevelSeverity (DEBUG … FATAL)YesserviceOriginating service nameYesmessageHuman-readable descriptionYestrace_idDistributed trace correlationYesspan_idCurrent span within traceYescorrelation_idBusiness-level correlation (order ID)When applicableerrorStructured error objectOn errorscontextRequest-specific metadataRecommended"
      },
      {
        "title": "Context Enrichment",
        "body": "Attach context at the middleware level so downstream logs inherit automatically:\n\napp.use((req, res, next) => {\n  const ctx = {\n    trace_id: req.headers['x-trace-id'] || crypto.randomUUID(),\n    request_id: crypto.randomUUID(),\n    user_id: req.user?.id,\n    method: req.method,\n    path: req.path,\n  };\n  asyncLocalStorage.run(ctx, () => next());\n});"
      },
      {
        "title": "Library Recommendations",
        "body": "LibraryLanguageStrengthsPerfPinoNode.jsFastest Node logger, low overheadExcellentstructlogPythonComposable processors, context bindingGoodzerologGoZero-allocation JSON loggingExcellentzapGoHigh performance, typed fieldsExcellenttracingRustSpans + events, async-awareExcellent\n\nChoose a logger that outputs structured JSON natively. Avoid loggers requiring post-processing."
      },
      {
        "title": "Log Levels",
        "body": "LevelWhen to UseExampleFATALApp cannot continue, process will exitDatabase connection pool exhaustedERROROperation failed, needs attentionPayment charge failed: CARD_DECLINEDWARNUnexpected but recoverableRetry 2/3 for upstream timeoutINFONormal business eventsOrder ORD-1234 placed successfullyDEBUGDeveloper troubleshootingCache miss for key user:82:preferencesTRACEVery fine-grained (rarely in prod)Entering validateAddress with payload\n\nRules: Production default = INFO and above. If you log an ERROR, someone should act on it. Every FATAL should trigger an alert."
      },
      {
        "title": "OpenTelemetry Setup",
        "body": "Always prefer OpenTelemetry over vendor-specific SDKs:\n\nimport { NodeSDK } from '@opentelemetry/sdk-node';\nimport { OTLPTraceExporter } from '@opentelemetry/exporter-trace-otlp-http';\nimport { getNodeAutoInstrumentations } from '@opentelemetry/auto-instrumentations-node';\n\nconst sdk = new NodeSDK({\n  serviceName: 'order-service',\n  traceExporter: new OTLPTraceExporter({\n    url: 'http://otel-collector:4318/v1/traces',\n  }),\n  instrumentations: [getNodeAutoInstrumentations()],\n});\nsdk.start();"
      },
      {
        "title": "Span Creation",
        "body": "const tracer = trace.getTracer('order-service');\n\nasync function processOrder(order: Order) {\n  return tracer.startActiveSpan('processOrder', async (span) => {\n    try {\n      span.setAttribute('order.id', order.id);\n      span.setAttribute('order.total_cents', order.totalCents);\n      await validateInventory(order);\n      await chargePayment(order);\n      span.setStatus({ code: SpanStatusCode.OK });\n    } catch (err) {\n      span.setStatus({ code: SpanStatusCode.ERROR, message: err.message });\n      span.recordException(err);\n      throw err;\n    } finally {\n      span.end();\n    }\n  });\n}"
      },
      {
        "title": "Context Propagation",
        "body": "Use W3C Trace Context (traceparent header) — default in OTel\nPropagate across HTTP, gRPC, and message queues\nFor async workers: serialise traceparent into the job payload"
      },
      {
        "title": "Trace Sampling",
        "body": "StrategyUse WhenAlways OnLow-traffic services, debuggingProbabilistic (N%)General production useRate-limited (N/sec)High-throughput servicesTail-basedWhen you need all error traces\n\nAlways sample 100% of error traces regardless of strategy."
      },
      {
        "title": "RED Method (Request-Driven)",
        "body": "Monitor these three for every service endpoint:\n\nMetricWhat It MeasuresPrometheus ExampleRateRequests/secrate(http_requests_total[5m])ErrorsFailed request ratiorate(http_requests_total{status=~\"5..\"}[5m])DurationResponse timehistogram_quantile(0.99, http_request_duration_seconds)"
      },
      {
        "title": "USE Method (Resource-Driven)",
        "body": "For infrastructure components (CPU, memory, disk, network):\n\nMetricWhat It MeasuresExampleUtilization% resource busyCPU usage at 78%SaturationWork queued/waiting12 requests queued in thread poolErrorsError events on resource3 disk I/O errors in last minute"
      },
      {
        "title": "Monitoring Stack",
        "body": "ToolCategoryBest ForPrometheusMetricsPull-based metrics, alerting rulesGrafanaVisualisationDashboards for metrics, logs, tracesJaegerTracingDistributed trace visualisationLokiLogsLog aggregation (pairs with Grafana)OpenTelemetryCollectionVendor-neutral telemetry collection\n\nRecommendation: Start with OTel Collector → Prometheus + Grafana + Loki + Jaeger. Migrate to SaaS only when operational overhead justifies cost."
      },
      {
        "title": "Severity Levels",
        "body": "SeverityResponse TimeExampleP1ImmediateService fully down, data lossP2< 30 minError rate > 5%, latency p99 > 5sP3Business hoursDisk > 80%, cert expiring in 7 daysP4Best effortNon-critical deprecation warning"
      },
      {
        "title": "Alert Fatigue Prevention",
        "body": "Alert on symptoms, not causes — \"error rate > 5%\" not \"pod restarted\"\nMulti-window, multi-burn-rate — catch both sudden spikes and slow burns\nRequire runbook links — every alert must link to diagnosis and remediation\nReview monthly — delete or tune alerts that never fire or always fire\nGroup related alerts — use inhibition rules to suppress child alerts\nSet appropriate thresholds — if alert fires daily and is ignored, raise threshold or delete"
      },
      {
        "title": "Overview Dashboard (\"War Room\")",
        "body": "Total requests/sec across all services\nGlobal error rate (%) with trendline\np50 / p95 / p99 latency\nActive alerts count by severity\nDeployment markers overlaid on graphs"
      },
      {
        "title": "Service Dashboard (Per-Service)",
        "body": "RED metrics for each endpoint\nDependency health (upstream/downstream success rates)\nResource utilisation (CPU, memory, connections)\nTop errors table with count and last seen"
      },
      {
        "title": "Observability Checklist",
        "body": "Every service must have:\n\n Structured JSON logging with consistent schema\n Correlation / trace IDs propagated on all requests\n RED metrics exposed for every external endpoint\n Health check endpoints (/healthz and /readyz)\n Distributed tracing with OpenTelemetry\n Dashboards for RED metrics and resource utilisation\n Alerts for error rate, latency, and saturation with runbook links\n Log level configurable at runtime without redeployment\n PII scrubbing verified and tested\n Retention policies defined for logs, metrics, and traces"
      },
      {
        "title": "Anti-Patterns",
        "body": "Anti-PatternProblemFixLogging PIIPrivacy/compliance violationMask or exclude PII; use token referencesExcessive loggingStorage costs balloon, signal drownsLog business events, not data flowUnstructured logsCannot query or alert on fieldsUse structured JSON with consistent schemaString interpolationBreaks structured fields, injection riskPass fields as metadata, not in messageMissing correlation IDsCannot trace across servicesGenerate and propagate trace_id everywhereAlert stormsOn-call fatigue, real issues buriedUse grouping, inhibition, deduplicationMetrics with high cardinalityPrometheus OOM, dashboard timeoutsNever use user ID or request ID as label"
      },
      {
        "title": "NEVER Do",
        "body": "NEVER log passwords, tokens, API keys, or secrets — even at DEBUG level\nNEVER use console.log / print in production — use a structured logger\nNEVER use user IDs, emails, or request IDs as metric labels — cardinality will explode\nNEVER create alerts without a runbook link — unactionable alerts erode trust\nNEVER rely on logs alone — you need metrics and traces for full observability\nNEVER log request/response bodies by default — opt-in only, with PII redaction\nNEVER ignore log volume — set budgets and alert when a service exceeds daily quota\nNEVER skip context propagation in async flows — broken traces are worse than no traces"
      }
    ],
    "body": "Logging & Observability\n\nPatterns for building observable systems across the three pillars: logs, metrics, and traces.\n\nThree Pillars\nPillar\tPurpose\tQuestion It Answers\tExample\nLogs\tWhat happened\tWhy did this request fail?\t{\"level\":\"error\",\"msg\":\"payment declined\",\"user_id\":\"u_82\"}\nMetrics\tHow much / how fast\tIs latency increasing?\thttp_request_duration_seconds{route=\"/api/orders\"} 0.342\nTraces\tRequest flow\tWhere is the bottleneck?\tSpan: api-gateway → auth → order-service → db\n\nEach pillar is strongest when correlated. Embed trace_id in every log line to jump from a log entry to the full distributed trace.\n\nStructured Logging\n\nAlways emit logs as structured JSON — never free-text strings.\n\nRequired Fields\nField\tPurpose\tRequired\ntimestamp\tISO-8601 with milliseconds\tYes\nlevel\tSeverity (DEBUG … FATAL)\tYes\nservice\tOriginating service name\tYes\nmessage\tHuman-readable description\tYes\ntrace_id\tDistributed trace correlation\tYes\nspan_id\tCurrent span within trace\tYes\ncorrelation_id\tBusiness-level correlation (order ID)\tWhen applicable\nerror\tStructured error object\tOn errors\ncontext\tRequest-specific metadata\tRecommended\nContext Enrichment\n\nAttach context at the middleware level so downstream logs inherit automatically:\n\napp.use((req, res, next) => {\n  const ctx = {\n    trace_id: req.headers['x-trace-id'] || crypto.randomUUID(),\n    request_id: crypto.randomUUID(),\n    user_id: req.user?.id,\n    method: req.method,\n    path: req.path,\n  };\n  asyncLocalStorage.run(ctx, () => next());\n});\n\nLibrary Recommendations\nLibrary\tLanguage\tStrengths\tPerf\nPino\tNode.js\tFastest Node logger, low overhead\tExcellent\nstructlog\tPython\tComposable processors, context binding\tGood\nzerolog\tGo\tZero-allocation JSON logging\tExcellent\nzap\tGo\tHigh performance, typed fields\tExcellent\ntracing\tRust\tSpans + events, async-aware\tExcellent\n\nChoose a logger that outputs structured JSON natively. Avoid loggers requiring post-processing.\n\nLog Levels\nLevel\tWhen to Use\tExample\nFATAL\tApp cannot continue, process will exit\tDatabase connection pool exhausted\nERROR\tOperation failed, needs attention\tPayment charge failed: CARD_DECLINED\nWARN\tUnexpected but recoverable\tRetry 2/3 for upstream timeout\nINFO\tNormal business events\tOrder ORD-1234 placed successfully\nDEBUG\tDeveloper troubleshooting\tCache miss for key user:82:preferences\nTRACE\tVery fine-grained (rarely in prod)\tEntering validateAddress with payload\n\nRules: Production default = INFO and above. If you log an ERROR, someone should act on it. Every FATAL should trigger an alert.\n\nDistributed Tracing\nOpenTelemetry Setup\n\nAlways prefer OpenTelemetry over vendor-specific SDKs:\n\nimport { NodeSDK } from '@opentelemetry/sdk-node';\nimport { OTLPTraceExporter } from '@opentelemetry/exporter-trace-otlp-http';\nimport { getNodeAutoInstrumentations } from '@opentelemetry/auto-instrumentations-node';\n\nconst sdk = new NodeSDK({\n  serviceName: 'order-service',\n  traceExporter: new OTLPTraceExporter({\n    url: 'http://otel-collector:4318/v1/traces',\n  }),\n  instrumentations: [getNodeAutoInstrumentations()],\n});\nsdk.start();\n\nSpan Creation\nconst tracer = trace.getTracer('order-service');\n\nasync function processOrder(order: Order) {\n  return tracer.startActiveSpan('processOrder', async (span) => {\n    try {\n      span.setAttribute('order.id', order.id);\n      span.setAttribute('order.total_cents', order.totalCents);\n      await validateInventory(order);\n      await chargePayment(order);\n      span.setStatus({ code: SpanStatusCode.OK });\n    } catch (err) {\n      span.setStatus({ code: SpanStatusCode.ERROR, message: err.message });\n      span.recordException(err);\n      throw err;\n    } finally {\n      span.end();\n    }\n  });\n}\n\nContext Propagation\nUse W3C Trace Context (traceparent header) — default in OTel\nPropagate across HTTP, gRPC, and message queues\nFor async workers: serialise traceparent into the job payload\nTrace Sampling\nStrategy\tUse When\nAlways On\tLow-traffic services, debugging\nProbabilistic (N%)\tGeneral production use\nRate-limited (N/sec)\tHigh-throughput services\nTail-based\tWhen you need all error traces\n\nAlways sample 100% of error traces regardless of strategy.\n\nMetrics Collection\nRED Method (Request-Driven)\n\nMonitor these three for every service endpoint:\n\nMetric\tWhat It Measures\tPrometheus Example\nRate\tRequests/sec\trate(http_requests_total[5m])\nErrors\tFailed request ratio\trate(http_requests_total{status=~\"5..\"}[5m])\nDuration\tResponse time\thistogram_quantile(0.99, http_request_duration_seconds)\nUSE Method (Resource-Driven)\n\nFor infrastructure components (CPU, memory, disk, network):\n\nMetric\tWhat It Measures\tExample\nUtilization\t% resource busy\tCPU usage at 78%\nSaturation\tWork queued/waiting\t12 requests queued in thread pool\nErrors\tError events on resource\t3 disk I/O errors in last minute\nMonitoring Stack\nTool\tCategory\tBest For\nPrometheus\tMetrics\tPull-based metrics, alerting rules\nGrafana\tVisualisation\tDashboards for metrics, logs, traces\nJaeger\tTracing\tDistributed trace visualisation\nLoki\tLogs\tLog aggregation (pairs with Grafana)\nOpenTelemetry\tCollection\tVendor-neutral telemetry collection\n\nRecommendation: Start with OTel Collector → Prometheus + Grafana + Loki + Jaeger. Migrate to SaaS only when operational overhead justifies cost.\n\nAlert Design\nSeverity Levels\nSeverity\tResponse Time\tExample\nP1\tImmediate\tService fully down, data loss\nP2\t< 30 min\tError rate > 5%, latency p99 > 5s\nP3\tBusiness hours\tDisk > 80%, cert expiring in 7 days\nP4\tBest effort\tNon-critical deprecation warning\nAlert Fatigue Prevention\nAlert on symptoms, not causes — \"error rate > 5%\" not \"pod restarted\"\nMulti-window, multi-burn-rate — catch both sudden spikes and slow burns\nRequire runbook links — every alert must link to diagnosis and remediation\nReview monthly — delete or tune alerts that never fire or always fire\nGroup related alerts — use inhibition rules to suppress child alerts\nSet appropriate thresholds — if alert fires daily and is ignored, raise threshold or delete\nDashboard Patterns\nOverview Dashboard (\"War Room\")\nTotal requests/sec across all services\nGlobal error rate (%) with trendline\np50 / p95 / p99 latency\nActive alerts count by severity\nDeployment markers overlaid on graphs\nService Dashboard (Per-Service)\nRED metrics for each endpoint\nDependency health (upstream/downstream success rates)\nResource utilisation (CPU, memory, connections)\nTop errors table with count and last seen\nObservability Checklist\n\nEvery service must have:\n\n Structured JSON logging with consistent schema\n Correlation / trace IDs propagated on all requests\n RED metrics exposed for every external endpoint\n Health check endpoints (/healthz and /readyz)\n Distributed tracing with OpenTelemetry\n Dashboards for RED metrics and resource utilisation\n Alerts for error rate, latency, and saturation with runbook links\n Log level configurable at runtime without redeployment\n PII scrubbing verified and tested\n Retention policies defined for logs, metrics, and traces\nAnti-Patterns\nAnti-Pattern\tProblem\tFix\nLogging PII\tPrivacy/compliance violation\tMask or exclude PII; use token references\nExcessive logging\tStorage costs balloon, signal drowns\tLog business events, not data flow\nUnstructured logs\tCannot query or alert on fields\tUse structured JSON with consistent schema\nString interpolation\tBreaks structured fields, injection risk\tPass fields as metadata, not in message\nMissing correlation IDs\tCannot trace across services\tGenerate and propagate trace_id everywhere\nAlert storms\tOn-call fatigue, real issues buried\tUse grouping, inhibition, deduplication\nMetrics with high cardinality\tPrometheus OOM, dashboard timeouts\tNever use user ID or request ID as label\nNEVER Do\nNEVER log passwords, tokens, API keys, or secrets — even at DEBUG level\nNEVER use console.log / print in production — use a structured logger\nNEVER use user IDs, emails, or request IDs as metric labels — cardinality will explode\nNEVER create alerts without a runbook link — unactionable alerts erode trust\nNEVER rely on logs alone — you need metrics and traces for full observability\nNEVER log request/response bodies by default — opt-in only, with PII redaction\nNEVER ignore log volume — set budgets and alert when a service exceeds daily quota\nNEVER skip context propagation in async flows — broken traces are worse than no traces"
  },
  "trust": {
    "sourceLabel": "tencent",
    "provenanceUrl": "https://clawhub.ai/wpank/logging-observability",
    "publisherUrl": "https://clawhub.ai/wpank/logging-observability",
    "owner": "wpank",
    "version": "0.1.0",
    "license": null,
    "verificationStatus": "Indexed source record"
  },
  "links": {
    "detailUrl": "https://openagent3.xyz/skills/logging-observability",
    "downloadUrl": "https://openagent3.xyz/downloads/logging-observability",
    "agentUrl": "https://openagent3.xyz/skills/logging-observability/agent",
    "manifestUrl": "https://openagent3.xyz/skills/logging-observability/agent.json",
    "briefUrl": "https://openagent3.xyz/skills/logging-observability/agent.md"
  }
}