{
  "schemaVersion": "1.0",
  "item": {
    "slug": "observability-designer",
    "name": "Observability Designer",
    "source": "tencent",
    "type": "skill",
    "category": "开发工具",
    "sourceUrl": "https://clawhub.ai/alirezarezvani/observability-designer",
    "canonicalUrl": "https://clawhub.ai/alirezarezvani/observability-designer",
    "targetPlatform": "OpenClaw"
  },
  "install": {
    "downloadMode": "redirect",
    "downloadUrl": "/downloads/observability-designer",
    "sourceDownloadUrl": "https://wry-manatee-359.convex.site/api/v1/download?slug=observability-designer",
    "sourcePlatform": "tencent",
    "targetPlatform": "OpenClaw",
    "installMethod": "Manual import",
    "extraction": "Extract archive",
    "prerequisites": [
      "OpenClaw"
    ],
    "packageFormat": "ZIP package",
    "includedAssets": [
      "README.md",
      "SKILL.md",
      "assets/sample_alerts.json",
      "assets/sample_service_api.json",
      "assets/sample_service_web.json",
      "expected_outputs/sample_dashboard.json"
    ],
    "primaryDoc": "SKILL.md",
    "quickSetup": [
      "Download the package from Yavira.",
      "Extract the archive and review SKILL.md first.",
      "Import or place the package into your OpenClaw setup."
    ],
    "agentAssist": {
      "summary": "Hand the extracted package to your coding agent with a concrete install brief instead of figuring it out manually.",
      "steps": [
        "Download the package from Yavira.",
        "Extract it into a folder your agent can access.",
        "Paste one of the prompts below and point your agent at the extracted folder."
      ],
      "prompts": [
        {
          "label": "New install",
          "body": "I downloaded a skill package from Yavira. Read SKILL.md from the extracted folder and install it by following the included instructions. Then review README.md for any prerequisites, environment setup, or post-install checks. Tell me what you changed and call out any manual steps you could not complete."
        },
        {
          "label": "Upgrade existing",
          "body": "I downloaded an updated skill package from Yavira. Read SKILL.md from the extracted folder, compare it with my current installation, and upgrade it while preserving any custom configuration unless the package docs explicitly say otherwise. Then review README.md for any prerequisites, environment setup, or post-install checks. Summarize what changed and any follow-up checks I should run."
        }
      ]
    },
    "sourceHealth": {
      "source": "tencent",
      "status": "healthy",
      "reason": "direct_download_ok",
      "recommendedAction": "download",
      "checkedAt": "2026-04-30T16:55:25.780Z",
      "expiresAt": "2026-05-07T16:55:25.780Z",
      "httpStatus": 200,
      "finalUrl": "https://wry-manatee-359.convex.site/api/v1/download?slug=observability-designer",
      "contentType": "application/zip",
      "probeMethod": "head",
      "details": {
        "probeUrl": "https://wry-manatee-359.convex.site/api/v1/download?slug=observability-designer",
        "contentDisposition": "attachment; filename=\"observability-designer-2.1.1.zip\"",
        "redirectLocation": null,
        "bodySnippet": null
      },
      "scope": "source",
      "summary": "Source download looks usable.",
      "detail": "Yavira can redirect you to the upstream package for this source.",
      "primaryActionLabel": "Download for OpenClaw",
      "primaryActionHref": "/downloads/observability-designer"
    },
    "validation": {
      "installChecklist": [
        "Use the Yavira download entry.",
        "Review SKILL.md after the package is downloaded.",
        "Confirm the extracted package contains the expected setup assets."
      ],
      "postInstallChecks": [
        "Confirm the extracted package includes the expected docs or setup files.",
        "Validate the skill or prompts are available in your target agent workspace.",
        "Capture any manual follow-up steps the agent could not complete."
      ]
    },
    "downloadPageUrl": "https://openagent3.xyz/downloads/observability-designer",
    "agentPageUrl": "https://openagent3.xyz/skills/observability-designer/agent",
    "manifestUrl": "https://openagent3.xyz/skills/observability-designer/agent.json",
    "briefUrl": "https://openagent3.xyz/skills/observability-designer/agent.md"
  },
  "agentAssist": {
    "summary": "Hand the extracted package to your coding agent with a concrete install brief instead of figuring it out manually.",
    "steps": [
      "Download the package from Yavira.",
      "Extract it into a folder your agent can access.",
      "Paste one of the prompts below and point your agent at the extracted folder."
    ],
    "prompts": [
      {
        "label": "New install",
        "body": "I downloaded a skill package from Yavira. Read SKILL.md from the extracted folder and install it by following the included instructions. Then review README.md for any prerequisites, environment setup, or post-install checks. Tell me what you changed and call out any manual steps you could not complete."
      },
      {
        "label": "Upgrade existing",
        "body": "I downloaded an updated skill package from Yavira. Read SKILL.md from the extracted folder, compare it with my current installation, and upgrade it while preserving any custom configuration unless the package docs explicitly say otherwise. Then review README.md for any prerequisites, environment setup, or post-install checks. Summarize what changed and any follow-up checks I should run."
      }
    ]
  },
  "documentation": {
    "source": "clawhub",
    "primaryDoc": "SKILL.md",
    "sections": [
      {
        "title": "Observability Designer (POWERFUL)",
        "body": "Category: Engineering\nTier: POWERFUL\nDescription: Design comprehensive observability strategies for production systems including SLI/SLO frameworks, alerting optimization, and dashboard generation."
      },
      {
        "title": "Overview",
        "body": "Observability Designer enables you to create production-ready observability strategies that provide deep insights into system behavior, performance, and reliability. This skill combines the three pillars of observability (metrics, logs, traces) with proven frameworks like SLI/SLO design, golden signals monitoring, and alert optimization to create comprehensive observability solutions."
      },
      {
        "title": "SLI/SLO/SLA Framework Design",
        "body": "Service Level Indicators (SLI): Define measurable signals that indicate service health\nService Level Objectives (SLO): Set reliability targets based on user experience\nService Level Agreements (SLA): Establish customer-facing commitments with consequences\nError Budget Management: Calculate and track error budget consumption\nBurn Rate Alerting: Multi-window burn rate alerts for proactive SLO protection"
      },
      {
        "title": "Three Pillars of Observability",
        "body": "Metrics\n\nGolden Signals: Latency, traffic, errors, and saturation monitoring\nRED Method: Rate, Errors, and Duration for request-driven services\nUSE Method: Utilization, Saturation, and Errors for resource monitoring\nBusiness Metrics: Revenue, user engagement, and feature adoption tracking\nInfrastructure Metrics: CPU, memory, disk, network, and custom resource metrics\n\nLogs\n\nStructured Logging: JSON-based log formats with consistent fields\nLog Aggregation: Centralized log collection and indexing strategies\nLog Levels: Appropriate use of DEBUG, INFO, WARN, ERROR, FATAL levels\nCorrelation IDs: Request tracing through distributed systems\nLog Sampling: Volume management for high-throughput systems\n\nTraces\n\nDistributed Tracing: End-to-end request flow visualization\nSpan Design: Meaningful span boundaries and metadata\nTrace Sampling: Intelligent sampling strategies for performance and cost\nService Maps: Automatic dependency discovery through traces\nRoot Cause Analysis: Trace-driven debugging workflows"
      },
      {
        "title": "Dashboard Design Principles",
        "body": "Information Architecture\n\nHierarchy: Overview → Service → Component → Instance drill-down paths\nGolden Ratio: 80% operational metrics, 20% exploratory metrics\nCognitive Load: Maximum 7±2 panels per dashboard screen\nUser Journey: Role-based dashboard personas (SRE, Developer, Executive)\n\nVisualization Best Practices\n\nChart Selection: Time series for trends, heatmaps for distributions, gauges for status\nColor Theory: Red for critical, amber for warning, green for healthy states\nReference Lines: SLO targets, capacity thresholds, and historical baselines\nTime Ranges: Default to meaningful windows (4h for incidents, 7d for trends)\n\nPanel Design\n\nMetric Queries: Efficient Prometheus/InfluxDB queries with proper aggregation\nAlerting Integration: Visual alert state indicators on relevant panels\nInteractive Elements: Template variables, drill-down links, and annotation overlays\nPerformance: Sub-second render times through query optimization"
      },
      {
        "title": "Alert Design and Optimization",
        "body": "Alert Classification\n\nSeverity Levels:\n\nCritical: Service down, SLO burn rate high\nWarning: Approaching thresholds, non-user-facing issues\nInfo: Deployment notifications, capacity planning alerts\n\n\nActionability: Every alert must have a clear response action\nAlert Routing: Escalation policies based on severity and team ownership\n\nAlert Fatigue Prevention\n\nSignal vs Noise: High precision (few false positives) over high recall\nHysteresis: Different thresholds for firing and resolving alerts\nSuppression: Dependent alert suppression during known outages\nGrouping: Related alerts grouped into single notifications\n\nAlert Rule Design\n\nThreshold Selection: Statistical methods for threshold determination\nWindow Functions: Appropriate averaging windows and percentile calculations\nAlert Lifecycle: Clear firing conditions and automatic resolution criteria\nTesting: Alert rule validation against historical data"
      },
      {
        "title": "Runbook Generation and Incident Response",
        "body": "Runbook Structure\n\nAlert Context: What the alert means and why it fired\nImpact Assessment: User-facing vs internal impact evaluation\nInvestigation Steps: Ordered troubleshooting procedures with time estimates\nResolution Actions: Common fixes and escalation procedures\nPost-Incident: Follow-up tasks and prevention measures\n\nIncident Detection Patterns\n\nAnomaly Detection: Statistical methods for detecting unusual patterns\nComposite Alerts: Multi-signal alerts for complex failure modes\nPredictive Alerts: Capacity and trend-based forward-looking alerts\nCanary Monitoring: Early detection through progressive deployment monitoring"
      },
      {
        "title": "Golden Signals Framework",
        "body": "Latency Monitoring\n\nRequest Latency: P50, P95, P99 response time tracking\nQueue Latency: Time spent waiting in processing queues\nNetwork Latency: Inter-service communication delays\nDatabase Latency: Query execution and connection pool metrics\n\nTraffic Monitoring\n\nRequest Rate: Requests per second with burst detection\nBandwidth Usage: Network throughput and capacity utilization\nUser Sessions: Active user tracking and session duration\nFeature Usage: API endpoint and feature adoption metrics\n\nError Monitoring\n\nError Rate: 4xx and 5xx HTTP response code tracking\nError Budget: SLO-based error rate targets and consumption\nError Distribution: Error type classification and trending\nSilent Failures: Detection of processing failures without HTTP errors\n\nSaturation Monitoring\n\nResource Utilization: CPU, memory, disk, and network usage\nQueue Depth: Processing queue length and wait times\nConnection Pools: Database and service connection saturation\nRate Limiting: API throttling and quota exhaustion tracking"
      },
      {
        "title": "Distributed Tracing Strategies",
        "body": "Trace Architecture\n\nSampling Strategy: Head-based, tail-based, and adaptive sampling\nTrace Propagation: Context propagation across service boundaries\nSpan Correlation: Parent-child relationship modeling\nTrace Storage: Retention policies and storage optimization\n\nService Instrumentation\n\nAuto-Instrumentation: Framework-based automatic trace generation\nManual Instrumentation: Custom span creation for business logic\nBaggage Handling: Cross-cutting concern propagation\nPerformance Impact: Instrumentation overhead measurement and optimization"
      },
      {
        "title": "Log Aggregation Patterns",
        "body": "Collection Architecture\n\nAgent Deployment: Log shipping agent strategies (push vs pull)\nLog Routing: Topic-based routing and filtering\nParsing Strategies: Structured vs unstructured log handling\nSchema Evolution: Log format versioning and migration\n\nStorage and Indexing\n\nIndex Design: Optimized field indexing for common query patterns\nRetention Policies: Time and volume-based log retention\nCompression: Log data compression and archival strategies\nSearch Performance: Query optimization and result caching"
      },
      {
        "title": "Cost Optimization for Observability",
        "body": "Data Management\n\nMetric Retention: Tiered retention based on metric importance\nLog Sampling: Intelligent sampling to reduce ingestion costs\nTrace Sampling: Cost-effective trace collection strategies\nData Archival: Cold storage for historical observability data\n\nResource Optimization\n\nQuery Efficiency: Optimized metric and log queries\nStorage Costs: Appropriate storage tiers for different data types\nIngestion Rate Limiting: Controlled data ingestion to manage costs\nCardinality Management: High-cardinality metric detection and mitigation"
      },
      {
        "title": "Scripts Overview",
        "body": "This skill includes three powerful Python scripts for comprehensive observability design:"
      },
      {
        "title": "1. SLO Designer (slo_designer.py)",
        "body": "Generates complete SLI/SLO frameworks based on service characteristics:\n\nInput: Service description JSON (type, criticality, dependencies)\nOutput: SLI definitions, SLO targets, error budgets, burn rate alerts, SLA recommendations\nFeatures: Multi-window burn rate calculations, error budget policies, alert rule generation"
      },
      {
        "title": "2. Alert Optimizer (alert_optimizer.py)",
        "body": "Analyzes and optimizes existing alert configurations:\n\nInput: Alert configuration JSON with rules, thresholds, and routing\nOutput: Optimization report and improved alert configuration\nFeatures: Noise detection, coverage gaps, duplicate identification, threshold optimization"
      },
      {
        "title": "3. Dashboard Generator (dashboard_generator.py)",
        "body": "Creates comprehensive dashboard specifications:\n\nInput: Service/system description JSON\nOutput: Grafana-compatible dashboard JSON and documentation\nFeatures: Golden signals coverage, RED/USE methods, drill-down paths, role-based views"
      },
      {
        "title": "Monitoring Stack Integration",
        "body": "Prometheus: Metric collection and alerting rule generation\nGrafana: Dashboard creation and visualization configuration\nElasticsearch/Kibana: Log analysis and dashboard integration\nJaeger/Zipkin: Distributed tracing configuration and analysis"
      },
      {
        "title": "CI/CD Integration",
        "body": "Pipeline Monitoring: Build, test, and deployment observability\nDeployment Correlation: Release impact tracking and rollback triggers\nFeature Flag Monitoring: A/B test and feature rollout observability\nPerformance Regression: Automated performance monitoring in pipelines"
      },
      {
        "title": "Incident Management Integration",
        "body": "PagerDuty/VictorOps: Alert routing and escalation policies\nSlack/Teams: Notification and collaboration integration\nJIRA/ServiceNow: Incident tracking and resolution workflows\nPost-Mortem: Automated incident analysis and improvement tracking"
      },
      {
        "title": "Multi-Cloud Observability",
        "body": "Cross-Cloud Metrics: Unified metrics across AWS, GCP, Azure\nNetwork Observability: Inter-cloud connectivity monitoring\nCost Attribution: Cloud resource cost tracking and optimization\nCompliance Monitoring: Security and compliance posture tracking"
      },
      {
        "title": "Microservices Observability",
        "body": "Service Mesh Integration: Istio/Linkerd observability configuration\nAPI Gateway Monitoring: Request routing and rate limiting observability\nContainer Orchestration: Kubernetes cluster and workload monitoring\nService Discovery: Dynamic service monitoring and health checks"
      },
      {
        "title": "Machine Learning Observability",
        "body": "Model Performance: Accuracy, drift, and bias monitoring\nFeature Store Monitoring: Feature quality and freshness tracking\nPipeline Observability: ML pipeline execution and performance monitoring\nA/B Test Analysis: Statistical significance and business impact measurement"
      },
      {
        "title": "Organizational Alignment",
        "body": "SLO Setting: Collaborative target setting between product and engineering\nAlert Ownership: Clear escalation paths and team responsibilities\nDashboard Governance: Centralized dashboard management and standards\nTraining Programs: Team education on observability tools and practices"
      },
      {
        "title": "Technical Excellence",
        "body": "Infrastructure as Code: Observability configuration version control\nTesting Strategy: Alert rule testing and dashboard validation\nPerformance Monitoring: Observability system performance tracking\nSecurity Considerations: Access control and data privacy in observability"
      },
      {
        "title": "Continuous Improvement",
        "body": "Metrics Review: Regular SLI/SLO effectiveness assessment\nAlert Tuning: Ongoing alert threshold and routing optimization\nDashboard Evolution: User feedback-driven dashboard improvements\nTool Evaluation: Regular assessment of observability tool effectiveness"
      },
      {
        "title": "Operational Metrics",
        "body": "Mean Time to Detection (MTTD): How quickly issues are identified\nMean Time to Resolution (MTTR): Time from detection to resolution\nAlert Precision: Percentage of actionable alerts\nSLO Achievement: Percentage of SLO targets met consistently"
      },
      {
        "title": "Business Metrics",
        "body": "System Reliability: Overall uptime and user experience quality\nEngineering Velocity: Development team productivity and deployment frequency\nCost Efficiency: Observability cost as percentage of infrastructure spend\nCustomer Satisfaction: User-reported reliability and performance satisfaction\n\nThis comprehensive observability design skill enables organizations to build robust, scalable monitoring and alerting systems that provide actionable insights while maintaining cost efficiency and operational excellence."
      }
    ],
    "body": "Observability Designer (POWERFUL)\n\nCategory: Engineering\nTier: POWERFUL\nDescription: Design comprehensive observability strategies for production systems including SLI/SLO frameworks, alerting optimization, and dashboard generation.\n\nOverview\n\nObservability Designer enables you to create production-ready observability strategies that provide deep insights into system behavior, performance, and reliability. This skill combines the three pillars of observability (metrics, logs, traces) with proven frameworks like SLI/SLO design, golden signals monitoring, and alert optimization to create comprehensive observability solutions.\n\nCore Competencies\nSLI/SLO/SLA Framework Design\nService Level Indicators (SLI): Define measurable signals that indicate service health\nService Level Objectives (SLO): Set reliability targets based on user experience\nService Level Agreements (SLA): Establish customer-facing commitments with consequences\nError Budget Management: Calculate and track error budget consumption\nBurn Rate Alerting: Multi-window burn rate alerts for proactive SLO protection\nThree Pillars of Observability\nMetrics\nGolden Signals: Latency, traffic, errors, and saturation monitoring\nRED Method: Rate, Errors, and Duration for request-driven services\nUSE Method: Utilization, Saturation, and Errors for resource monitoring\nBusiness Metrics: Revenue, user engagement, and feature adoption tracking\nInfrastructure Metrics: CPU, memory, disk, network, and custom resource metrics\nLogs\nStructured Logging: JSON-based log formats with consistent fields\nLog Aggregation: Centralized log collection and indexing strategies\nLog Levels: Appropriate use of DEBUG, INFO, WARN, ERROR, FATAL levels\nCorrelation IDs: Request tracing through distributed systems\nLog Sampling: Volume management for high-throughput systems\nTraces\nDistributed Tracing: End-to-end request flow visualization\nSpan Design: Meaningful span boundaries and metadata\nTrace Sampling: Intelligent sampling strategies for performance and cost\nService Maps: Automatic dependency discovery through traces\nRoot Cause Analysis: Trace-driven debugging workflows\nDashboard Design Principles\nInformation Architecture\nHierarchy: Overview → Service → Component → Instance drill-down paths\nGolden Ratio: 80% operational metrics, 20% exploratory metrics\nCognitive Load: Maximum 7±2 panels per dashboard screen\nUser Journey: Role-based dashboard personas (SRE, Developer, Executive)\nVisualization Best Practices\nChart Selection: Time series for trends, heatmaps for distributions, gauges for status\nColor Theory: Red for critical, amber for warning, green for healthy states\nReference Lines: SLO targets, capacity thresholds, and historical baselines\nTime Ranges: Default to meaningful windows (4h for incidents, 7d for trends)\nPanel Design\nMetric Queries: Efficient Prometheus/InfluxDB queries with proper aggregation\nAlerting Integration: Visual alert state indicators on relevant panels\nInteractive Elements: Template variables, drill-down links, and annotation overlays\nPerformance: Sub-second render times through query optimization\nAlert Design and Optimization\nAlert Classification\nSeverity Levels:\nCritical: Service down, SLO burn rate high\nWarning: Approaching thresholds, non-user-facing issues\nInfo: Deployment notifications, capacity planning alerts\nActionability: Every alert must have a clear response action\nAlert Routing: Escalation policies based on severity and team ownership\nAlert Fatigue Prevention\nSignal vs Noise: High precision (few false positives) over high recall\nHysteresis: Different thresholds for firing and resolving alerts\nSuppression: Dependent alert suppression during known outages\nGrouping: Related alerts grouped into single notifications\nAlert Rule Design\nThreshold Selection: Statistical methods for threshold determination\nWindow Functions: Appropriate averaging windows and percentile calculations\nAlert Lifecycle: Clear firing conditions and automatic resolution criteria\nTesting: Alert rule validation against historical data\nRunbook Generation and Incident Response\nRunbook Structure\nAlert Context: What the alert means and why it fired\nImpact Assessment: User-facing vs internal impact evaluation\nInvestigation Steps: Ordered troubleshooting procedures with time estimates\nResolution Actions: Common fixes and escalation procedures\nPost-Incident: Follow-up tasks and prevention measures\nIncident Detection Patterns\nAnomaly Detection: Statistical methods for detecting unusual patterns\nComposite Alerts: Multi-signal alerts for complex failure modes\nPredictive Alerts: Capacity and trend-based forward-looking alerts\nCanary Monitoring: Early detection through progressive deployment monitoring\nGolden Signals Framework\nLatency Monitoring\nRequest Latency: P50, P95, P99 response time tracking\nQueue Latency: Time spent waiting in processing queues\nNetwork Latency: Inter-service communication delays\nDatabase Latency: Query execution and connection pool metrics\nTraffic Monitoring\nRequest Rate: Requests per second with burst detection\nBandwidth Usage: Network throughput and capacity utilization\nUser Sessions: Active user tracking and session duration\nFeature Usage: API endpoint and feature adoption metrics\nError Monitoring\nError Rate: 4xx and 5xx HTTP response code tracking\nError Budget: SLO-based error rate targets and consumption\nError Distribution: Error type classification and trending\nSilent Failures: Detection of processing failures without HTTP errors\nSaturation Monitoring\nResource Utilization: CPU, memory, disk, and network usage\nQueue Depth: Processing queue length and wait times\nConnection Pools: Database and service connection saturation\nRate Limiting: API throttling and quota exhaustion tracking\nDistributed Tracing Strategies\nTrace Architecture\nSampling Strategy: Head-based, tail-based, and adaptive sampling\nTrace Propagation: Context propagation across service boundaries\nSpan Correlation: Parent-child relationship modeling\nTrace Storage: Retention policies and storage optimization\nService Instrumentation\nAuto-Instrumentation: Framework-based automatic trace generation\nManual Instrumentation: Custom span creation for business logic\nBaggage Handling: Cross-cutting concern propagation\nPerformance Impact: Instrumentation overhead measurement and optimization\nLog Aggregation Patterns\nCollection Architecture\nAgent Deployment: Log shipping agent strategies (push vs pull)\nLog Routing: Topic-based routing and filtering\nParsing Strategies: Structured vs unstructured log handling\nSchema Evolution: Log format versioning and migration\nStorage and Indexing\nIndex Design: Optimized field indexing for common query patterns\nRetention Policies: Time and volume-based log retention\nCompression: Log data compression and archival strategies\nSearch Performance: Query optimization and result caching\nCost Optimization for Observability\nData Management\nMetric Retention: Tiered retention based on metric importance\nLog Sampling: Intelligent sampling to reduce ingestion costs\nTrace Sampling: Cost-effective trace collection strategies\nData Archival: Cold storage for historical observability data\nResource Optimization\nQuery Efficiency: Optimized metric and log queries\nStorage Costs: Appropriate storage tiers for different data types\nIngestion Rate Limiting: Controlled data ingestion to manage costs\nCardinality Management: High-cardinality metric detection and mitigation\nScripts Overview\n\nThis skill includes three powerful Python scripts for comprehensive observability design:\n\n1. SLO Designer (slo_designer.py)\n\nGenerates complete SLI/SLO frameworks based on service characteristics:\n\nInput: Service description JSON (type, criticality, dependencies)\nOutput: SLI definitions, SLO targets, error budgets, burn rate alerts, SLA recommendations\nFeatures: Multi-window burn rate calculations, error budget policies, alert rule generation\n2. Alert Optimizer (alert_optimizer.py)\n\nAnalyzes and optimizes existing alert configurations:\n\nInput: Alert configuration JSON with rules, thresholds, and routing\nOutput: Optimization report and improved alert configuration\nFeatures: Noise detection, coverage gaps, duplicate identification, threshold optimization\n3. Dashboard Generator (dashboard_generator.py)\n\nCreates comprehensive dashboard specifications:\n\nInput: Service/system description JSON\nOutput: Grafana-compatible dashboard JSON and documentation\nFeatures: Golden signals coverage, RED/USE methods, drill-down paths, role-based views\nIntegration Patterns\nMonitoring Stack Integration\nPrometheus: Metric collection and alerting rule generation\nGrafana: Dashboard creation and visualization configuration\nElasticsearch/Kibana: Log analysis and dashboard integration\nJaeger/Zipkin: Distributed tracing configuration and analysis\nCI/CD Integration\nPipeline Monitoring: Build, test, and deployment observability\nDeployment Correlation: Release impact tracking and rollback triggers\nFeature Flag Monitoring: A/B test and feature rollout observability\nPerformance Regression: Automated performance monitoring in pipelines\nIncident Management Integration\nPagerDuty/VictorOps: Alert routing and escalation policies\nSlack/Teams: Notification and collaboration integration\nJIRA/ServiceNow: Incident tracking and resolution workflows\nPost-Mortem: Automated incident analysis and improvement tracking\nAdvanced Patterns\nMulti-Cloud Observability\nCross-Cloud Metrics: Unified metrics across AWS, GCP, Azure\nNetwork Observability: Inter-cloud connectivity monitoring\nCost Attribution: Cloud resource cost tracking and optimization\nCompliance Monitoring: Security and compliance posture tracking\nMicroservices Observability\nService Mesh Integration: Istio/Linkerd observability configuration\nAPI Gateway Monitoring: Request routing and rate limiting observability\nContainer Orchestration: Kubernetes cluster and workload monitoring\nService Discovery: Dynamic service monitoring and health checks\nMachine Learning Observability\nModel Performance: Accuracy, drift, and bias monitoring\nFeature Store Monitoring: Feature quality and freshness tracking\nPipeline Observability: ML pipeline execution and performance monitoring\nA/B Test Analysis: Statistical significance and business impact measurement\nBest Practices\nOrganizational Alignment\nSLO Setting: Collaborative target setting between product and engineering\nAlert Ownership: Clear escalation paths and team responsibilities\nDashboard Governance: Centralized dashboard management and standards\nTraining Programs: Team education on observability tools and practices\nTechnical Excellence\nInfrastructure as Code: Observability configuration version control\nTesting Strategy: Alert rule testing and dashboard validation\nPerformance Monitoring: Observability system performance tracking\nSecurity Considerations: Access control and data privacy in observability\nContinuous Improvement\nMetrics Review: Regular SLI/SLO effectiveness assessment\nAlert Tuning: Ongoing alert threshold and routing optimization\nDashboard Evolution: User feedback-driven dashboard improvements\nTool Evaluation: Regular assessment of observability tool effectiveness\nSuccess Metrics\nOperational Metrics\nMean Time to Detection (MTTD): How quickly issues are identified\nMean Time to Resolution (MTTR): Time from detection to resolution\nAlert Precision: Percentage of actionable alerts\nSLO Achievement: Percentage of SLO targets met consistently\nBusiness Metrics\nSystem Reliability: Overall uptime and user experience quality\nEngineering Velocity: Development team productivity and deployment frequency\nCost Efficiency: Observability cost as percentage of infrastructure spend\nCustomer Satisfaction: User-reported reliability and performance satisfaction\n\nThis comprehensive observability design skill enables organizations to build robust, scalable monitoring and alerting systems that provide actionable insights while maintaining cost efficiency and operational excellence."
  },
  "trust": {
    "sourceLabel": "tencent",
    "provenanceUrl": "https://clawhub.ai/alirezarezvani/observability-designer",
    "publisherUrl": "https://clawhub.ai/alirezarezvani/observability-designer",
    "owner": "alirezarezvani",
    "version": "2.1.1",
    "license": null,
    "verificationStatus": "Indexed source record"
  },
  "links": {
    "detailUrl": "https://openagent3.xyz/skills/observability-designer",
    "downloadUrl": "https://openagent3.xyz/downloads/observability-designer",
    "agentUrl": "https://openagent3.xyz/skills/observability-designer/agent",
    "manifestUrl": "https://openagent3.xyz/skills/observability-designer/agent.json",
    "briefUrl": "https://openagent3.xyz/skills/observability-designer/agent.md"
  }
}