{
  "schemaVersion": "1.0",
  "item": {
    "slug": "prometheus-devops",
    "name": "Prometheus",
    "source": "tencent",
    "type": "skill",
    "category": "开发工具",
    "sourceUrl": "https://clawhub.ai/wpank/prometheus-devops",
    "canonicalUrl": "https://clawhub.ai/wpank/prometheus-devops",
    "targetPlatform": "OpenClaw"
  },
  "install": {
    "downloadMode": "redirect",
    "downloadUrl": "/downloads/prometheus-devops",
    "sourceDownloadUrl": "https://wry-manatee-359.convex.site/api/v1/download?slug=prometheus-devops",
    "sourcePlatform": "tencent",
    "targetPlatform": "OpenClaw",
    "installMethod": "Manual import",
    "extraction": "Extract archive",
    "prerequisites": [
      "OpenClaw"
    ],
    "packageFormat": "ZIP package",
    "includedAssets": [
      "README.md",
      "SKILL.md",
      "templates/recording-rules.yml",
      "templates/alert-rules.yml",
      "templates/prometheus.yml"
    ],
    "primaryDoc": "SKILL.md",
    "quickSetup": [
      "Download the package from Yavira.",
      "Extract the archive and review SKILL.md first.",
      "Import or place the package into your OpenClaw setup."
    ],
    "agentAssist": {
      "summary": "Hand the extracted package to your coding agent with a concrete install brief instead of figuring it out manually.",
      "steps": [
        "Download the package from Yavira.",
        "Extract it into a folder your agent can access.",
        "Paste one of the prompts below and point your agent at the extracted folder."
      ],
      "prompts": [
        {
          "label": "New install",
          "body": "I downloaded a skill package from Yavira. Read SKILL.md from the extracted folder and install it by following the included instructions. Then review README.md for any prerequisites, environment setup, or post-install checks. Tell me what you changed and call out any manual steps you could not complete."
        },
        {
          "label": "Upgrade existing",
          "body": "I downloaded an updated skill package from Yavira. Read SKILL.md from the extracted folder, compare it with my current installation, and upgrade it while preserving any custom configuration unless the package docs explicitly say otherwise. Then review README.md for any prerequisites, environment setup, or post-install checks. Summarize what changed and any follow-up checks I should run."
        }
      ]
    },
    "sourceHealth": {
      "source": "tencent",
      "slug": "prometheus-devops",
      "status": "healthy",
      "reason": "direct_download_ok",
      "recommendedAction": "download",
      "checkedAt": "2026-05-07T11:16:56.222Z",
      "expiresAt": "2026-05-14T11:16:56.222Z",
      "httpStatus": 200,
      "finalUrl": "https://wry-manatee-359.convex.site/api/v1/download?slug=prometheus-devops",
      "contentType": "application/zip",
      "probeMethod": "head",
      "details": {
        "probeUrl": "https://wry-manatee-359.convex.site/api/v1/download?slug=prometheus-devops",
        "contentDisposition": "attachment; filename=\"prometheus-devops-1.0.0.zip\"",
        "redirectLocation": null,
        "bodySnippet": null,
        "slug": "prometheus-devops"
      },
      "scope": "item",
      "summary": "Item download looks usable.",
      "detail": "Yavira can redirect you to the upstream package for this item.",
      "primaryActionLabel": "Download for OpenClaw",
      "primaryActionHref": "/downloads/prometheus-devops"
    },
    "validation": {
      "installChecklist": [
        "Use the Yavira download entry.",
        "Review SKILL.md after the package is downloaded.",
        "Confirm the extracted package contains the expected setup assets."
      ],
      "postInstallChecks": [
        "Confirm the extracted package includes the expected docs or setup files.",
        "Validate the skill or prompts are available in your target agent workspace.",
        "Capture any manual follow-up steps the agent could not complete."
      ]
    },
    "downloadPageUrl": "https://openagent3.xyz/downloads/prometheus-devops",
    "agentPageUrl": "https://openagent3.xyz/skills/prometheus-devops/agent",
    "manifestUrl": "https://openagent3.xyz/skills/prometheus-devops/agent.json",
    "briefUrl": "https://openagent3.xyz/skills/prometheus-devops/agent.md"
  },
  "agentAssist": {
    "summary": "Hand the extracted package to your coding agent with a concrete install brief instead of figuring it out manually.",
    "steps": [
      "Download the package from Yavira.",
      "Extract it into a folder your agent can access.",
      "Paste one of the prompts below and point your agent at the extracted folder."
    ],
    "prompts": [
      {
        "label": "New install",
        "body": "I downloaded a skill package from Yavira. Read SKILL.md from the extracted folder and install it by following the included instructions. Then review README.md for any prerequisites, environment setup, or post-install checks. Tell me what you changed and call out any manual steps you could not complete."
      },
      {
        "label": "Upgrade existing",
        "body": "I downloaded an updated skill package from Yavira. Read SKILL.md from the extracted folder, compare it with my current installation, and upgrade it while preserving any custom configuration unless the package docs explicitly say otherwise. Then review README.md for any prerequisites, environment setup, or post-install checks. Summarize what changed and any follow-up checks I should run."
      }
    ]
  },
  "documentation": {
    "source": "clawhub",
    "primaryDoc": "SKILL.md",
    "sections": [
      {
        "title": "Prometheus",
        "body": "Production Prometheus setup covering scrape configuration, service discovery,\nrecording rules, alert rules, and operational best practices for infrastructure\nand application monitoring."
      },
      {
        "title": "When to Use",
        "body": "Scenario\tExample\nSet up metrics collection\tNew service needs Prometheus scraping\nConfigure service discovery\tK8s pods, file-based, or static targets\nCreate recording rules\tPre-compute expensive PromQL queries\nDesign alert rules\tSLO-based alerts for availability and latency\nProduction deployment\tHA setup with retention and storage planning\nTroubleshoot scraping\tTargets down, metrics missing, relabeling issues"
      },
      {
        "title": "Architecture",
        "body": "Applications ──(/metrics)──→ Prometheus Server ──→ AlertManager → Slack/PD\n      ↑                           │\n  client libraries          ├──→ Grafana (dashboards)\n  (prom client)             └──→ Thanos/Cortex (long-term storage)"
      },
      {
        "title": "Kubernetes (Helm)",
        "body": "helm repo add prometheus-community \\\n  https://prometheus-community.github.io/helm-charts\nhelm install prometheus prometheus-community/kube-prometheus-stack \\\n  --namespace monitoring --create-namespace \\\n  --set prometheus.prometheusSpec.retention=30d \\\n  --set prometheus.prometheusSpec.storageVolumeSize=50Gi"
      },
      {
        "title": "prometheus.yml",
        "body": "global:\n  scrape_interval: 15s\n  evaluation_interval: 15s\n  external_labels:\n    cluster: production\n    region: us-west-2\n\nalerting:\n  alertmanagers:\n    - static_configs:\n        - targets: [\"alertmanager:9093\"]\n\nrule_files:\n  - /etc/prometheus/rules/*.yml\n\nscrape_configs:\n  # Self-monitoring\n  - job_name: prometheus\n    static_configs:\n      - targets: [\"localhost:9090\"]\n\n  # Node exporters\n  - job_name: node-exporter\n    static_configs:\n      - targets: [\"node1:9100\", \"node2:9100\", \"node3:9100\"]\n    relabel_configs:\n      - source_labels: [__address__]\n        target_label: instance\n        regex: \"([^:]+)(:[0-9]+)?\"\n        replacement: \"${1}\"\n\n  # Application metrics (TLS)\n  - job_name: my-app\n    scheme: https\n    metrics_path: /metrics\n    tls_config:\n      ca_file: /etc/prometheus/ca.crt\n    static_configs:\n      - targets: [\"app1:9090\", \"app2:9090\"]"
      },
      {
        "title": "Kubernetes Pods (Annotation-Based)",
        "body": "scrape_configs:\n  - job_name: kubernetes-pods\n    kubernetes_sd_configs:\n      - role: pod\n    relabel_configs:\n      - source_labels:\n          [__meta_kubernetes_pod_annotation_prometheus_io_scrape]\n        action: keep\n        regex: true\n      - source_labels:\n          [__meta_kubernetes_pod_annotation_prometheus_io_path]\n        action: replace\n        target_label: __metrics_path__\n        regex: (.+)\n      - source_labels:\n          [__address__, __meta_kubernetes_pod_annotation_prometheus_io_port]\n        action: replace\n        regex: ([^:]+)(?::\\d+)?;(\\d+)\n        replacement: $1:$2\n        target_label: __address__\n      - source_labels: [__meta_kubernetes_namespace]\n        target_label: namespace\n      - source_labels: [__meta_kubernetes_pod_name]\n        target_label: pod\n\nPod annotations to enable scraping:\n\nmetadata:\n  annotations:\n    prometheus.io/scrape: \"true\"\n    prometheus.io/port: \"9090\"\n    prometheus.io/path: \"/metrics\""
      },
      {
        "title": "File-Based Discovery",
        "body": "scrape_configs:\n  - job_name: file-sd\n    file_sd_configs:\n      - files: [\"/etc/prometheus/targets/*.json\"]\n        refresh_interval: 5m\n\ntargets/production.json:\n\n[{\n  \"targets\": [\"app1:9090\", \"app2:9090\"],\n  \"labels\": { \"env\": \"production\", \"service\": \"api\" }\n}]"
      },
      {
        "title": "Discovery Method Comparison",
        "body": "Method\tBest For\tDynamic\nstatic_configs\tFixed infrastructure, dev\tNo\nfile_sd_configs\tCM-managed inventories\tYes (file watch)\nkubernetes_sd_configs\tK8s workloads\tYes (API watch)\nconsul_sd_configs\tConsul service mesh\tYes (Consul watch)\nec2_sd_configs\tAWS EC2 instances\tYes (API poll)"
      },
      {
        "title": "Recording Rules",
        "body": "Pre-compute expensive queries for dashboard and alert performance:\n\n# /etc/prometheus/rules/recording_rules.yml\ngroups:\n  - name: api_metrics\n    interval: 15s\n    rules:\n      - record: job:http_requests:rate5m\n        expr: sum by (job) (rate(http_requests_total[5m]))\n\n      - record: job:http_errors:rate5m\n        expr: sum by (job) (rate(http_requests_total{status=~\"5..\"}[5m]))\n\n      - record: job:http_error_rate:ratio\n        expr: job:http_errors:rate5m / job:http_requests:rate5m\n\n      - record: job:http_duration:p95\n        expr: >\n          histogram_quantile(0.95,\n            sum by (job, le) (rate(http_request_duration_seconds_bucket[5m]))\n          )\n\n  - name: resource_metrics\n    interval: 30s\n    rules:\n      - record: instance:node_cpu:utilization\n        expr: >\n          100 - (avg by (instance)\n            (rate(node_cpu_seconds_total{mode=\"idle\"}[5m])) * 100)\n\n      - record: instance:node_memory:utilization\n        expr: >\n          100 - ((node_memory_MemAvailable_bytes\n            / node_memory_MemTotal_bytes) * 100)\n\n      - record: instance:node_disk:utilization\n        expr: >\n          100 - ((node_filesystem_avail_bytes\n            / node_filesystem_size_bytes) * 100)"
      },
      {
        "title": "Naming Convention",
        "body": "level:metric_name:operations\n\nPart\tExample\tMeaning\nlevel\tjob:, instance:\tAggregation level\nmetric_name\thttp_requests\tBase metric\noperations\t:rate5m, :ratio\tApplied functions"
      },
      {
        "title": "Alert Rules",
        "body": "# /etc/prometheus/rules/alert_rules.yml\ngroups:\n  - name: availability\n    rules:\n      - alert: ServiceDown\n        expr: up{job=\"my-app\"} == 0\n        for: 1m\n        labels:\n          severity: critical\n        annotations:\n          summary: \"{{ $labels.instance }} is down\"\n          description: \"{{ $labels.job }} down for >1 minute\"\n\n      - alert: HighErrorRate\n        expr: job:http_error_rate:ratio > 0.05\n        for: 5m\n        labels:\n          severity: warning\n        annotations:\n          summary: \"Error rate {{ $value | humanizePercentage }} for {{ $labels.job }}\"\n\n      - alert: HighP95Latency\n        expr: job:http_duration:p95 > 1\n        for: 5m\n        labels:\n          severity: warning\n        annotations:\n          summary: \"P95 latency {{ $value }}s for {{ $labels.job }}\"\n\n  - name: resources\n    rules:\n      - alert: HighCPU\n        expr: instance:node_cpu:utilization > 80\n        for: 5m\n        labels: { severity: warning }\n        annotations:\n          summary: \"CPU {{ $value }}% on {{ $labels.instance }}\"\n\n      - alert: HighMemory\n        expr: instance:node_memory:utilization > 85\n        for: 5m\n        labels: { severity: warning }\n        annotations:\n          summary: \"Memory {{ $value }}% on {{ $labels.instance }}\"\n\n      - alert: DiskSpaceLow\n        expr: instance:node_disk:utilization > 90\n        for: 5m\n        labels: { severity: critical }\n        annotations:\n          summary: \"Disk {{ $value }}% on {{ $labels.instance }}\""
      },
      {
        "title": "Alert Severity Guide",
        "body": "Severity\tThreshold\tResponse\ncritical\tService down, data loss risk\tPage on-call immediately\nwarning\tDegraded, approaching limit\tInvestigate within hours\ninfo\tNotable but not urgent\tReview in next business day"
      },
      {
        "title": "Validation",
        "body": "# Validate config syntax\npromtool check config prometheus.yml\n\n# Validate rule files\npromtool check rules /etc/prometheus/rules/*.yml\n\n# Test a query\npromtool query instant http://localhost:9090 'up'\n\n# Reload config without restart\ncurl -X POST http://localhost:9090/-/reload"
      },
      {
        "title": "Best Practices",
        "body": "Practice\tDetail\nNaming: prefix_name_unit\tSnake_case, _total for counters, _seconds/_bytes for units\nScrape intervals 15–60s\tShorter wastes resources and storage\nRecording rules for dashboards\tPre-compute anything queried repeatedly\nMonitor Prometheus itself\tprometheus_tsdb_*, scrape_duration_seconds\nHA deployment\t2+ instances scraping same targets\nRetention planning\tMatch --storage.tsdb.retention.time to disk capacity\nFederation for scale\tGlobal Prometheus aggregates from regional instances\nLong-term storage\tThanos or Cortex for >30d retention"
      },
      {
        "title": "Troubleshooting Quick Reference",
        "body": "Problem\tDiagnosis\tFix\nTarget shows DOWN\tCheck /targets page for error\tFix firewall, verify endpoint, check TLS\nMetrics missing\tQuery up{job=\"x\"}\tVerify scrape config, check /metrics endpoint\nHigh cardinality\tprometheus_tsdb_head_series growing\tDrop high-cardinality labels with metric_relabel_configs\nStorage filling up\tCheck prometheus_tsdb_storage_*\tReduce retention, add disk, enable compaction\nSlow queries\tCheck prometheus_engine_query_duration_seconds\tAdd recording rules, reduce range, limit series\nConfig not applied\tCheck prometheus_config_last_reload_successful\tFix syntax, POST /-/reload"
      },
      {
        "title": "NEVER Do",
        "body": "Anti-Pattern\tWhy\tDo Instead\nScrape interval < 5s\tOverwhelms targets and storage\tUse 15–60s intervals\nHigh-cardinality labels (user ID, request ID)\tExplodes TSDB series count\tUse logs for high-cardinality data\nAlert without for duration\tFires on transient spikes\tAlways set for: 1m minimum\nSkip recording rules\tDashboards compute expensive queries every load\tPre-compute with recording rules\nStore secrets in prometheus.yml\tConfig often in Git\tUse file-based secrets or env substitution\nIgnore up metric\tMiss targets silently going down\tAlert on up == 0 for all jobs\nSingle Prometheus instance in prod\tSingle point of failure\tRun 2+ replicas with shared targets\nUnbounded retention\tDisk fills, Prometheus crashes\tSet explicit --storage.tsdb.retention.time"
      },
      {
        "title": "Templates",
        "body": "Template\tDescription\ntemplates/prometheus.yml\tFull config with static, file-based, and K8s discovery\ntemplates/alert-rules.yml\t25+ alert rules by category\ntemplates/recording-rules.yml\tPre-computed metrics for HTTP, latency, resources, SLOs"
      }
    ],
    "body": "Prometheus\n\nProduction Prometheus setup covering scrape configuration, service discovery, recording rules, alert rules, and operational best practices for infrastructure and application monitoring.\n\nWhen to Use\nScenario\tExample\nSet up metrics collection\tNew service needs Prometheus scraping\nConfigure service discovery\tK8s pods, file-based, or static targets\nCreate recording rules\tPre-compute expensive PromQL queries\nDesign alert rules\tSLO-based alerts for availability and latency\nProduction deployment\tHA setup with retention and storage planning\nTroubleshoot scraping\tTargets down, metrics missing, relabeling issues\nArchitecture\nApplications ──(/metrics)──→ Prometheus Server ──→ AlertManager → Slack/PD\n      ↑                           │\n  client libraries          ├──→ Grafana (dashboards)\n  (prom client)             └──→ Thanos/Cortex (long-term storage)\n\nInstallation\nKubernetes (Helm)\nhelm repo add prometheus-community \\\n  https://prometheus-community.github.io/helm-charts\nhelm install prometheus prometheus-community/kube-prometheus-stack \\\n  --namespace monitoring --create-namespace \\\n  --set prometheus.prometheusSpec.retention=30d \\\n  --set prometheus.prometheusSpec.storageVolumeSize=50Gi\n\nCore Configuration\nprometheus.yml\nglobal:\n  scrape_interval: 15s\n  evaluation_interval: 15s\n  external_labels:\n    cluster: production\n    region: us-west-2\n\nalerting:\n  alertmanagers:\n    - static_configs:\n        - targets: [\"alertmanager:9093\"]\n\nrule_files:\n  - /etc/prometheus/rules/*.yml\n\nscrape_configs:\n  # Self-monitoring\n  - job_name: prometheus\n    static_configs:\n      - targets: [\"localhost:9090\"]\n\n  # Node exporters\n  - job_name: node-exporter\n    static_configs:\n      - targets: [\"node1:9100\", \"node2:9100\", \"node3:9100\"]\n    relabel_configs:\n      - source_labels: [__address__]\n        target_label: instance\n        regex: \"([^:]+)(:[0-9]+)?\"\n        replacement: 
\"${1}\"\n\n  # Application metrics (TLS)\n  - job_name: my-app\n    scheme: https\n    metrics_path: /metrics\n    tls_config:\n      ca_file: /etc/prometheus/ca.crt\n    static_configs:\n      - targets: [\"app1:9090\", \"app2:9090\"]\n\nService Discovery\nKubernetes Pods (Annotation-Based)\nscrape_configs:\n  - job_name: kubernetes-pods\n    kubernetes_sd_configs:\n      - role: pod\n    relabel_configs:\n      - source_labels:\n          [__meta_kubernetes_pod_annotation_prometheus_io_scrape]\n        action: keep\n        regex: true\n      - source_labels:\n          [__meta_kubernetes_pod_annotation_prometheus_io_path]\n        action: replace\n        target_label: __metrics_path__\n        regex: (.+)\n      - source_labels:\n          [__address__, __meta_kubernetes_pod_annotation_prometheus_io_port]\n        action: replace\n        regex: ([^:]+)(?::\\d+)?;(\\d+)\n        replacement: $1:$2\n        target_label: __address__\n      - source_labels: [__meta_kubernetes_namespace]\n        target_label: namespace\n      - source_labels: [__meta_kubernetes_pod_name]\n        target_label: pod\n\n\nPod annotations to enable scraping:\n\nmetadata:\n  annotations:\n    prometheus.io/scrape: \"true\"\n    prometheus.io/port: \"9090\"\n    prometheus.io/path: \"/metrics\"\n\nFile-Based Discovery\nscrape_configs:\n  - job_name: file-sd\n    file_sd_configs:\n      - files: [\"/etc/prometheus/targets/*.json\"]\n        refresh_interval: 5m\n\n\ntargets/production.json:\n\n[{\n  \"targets\": [\"app1:9090\", \"app2:9090\"],\n  \"labels\": { \"env\": \"production\", \"service\": \"api\" }\n}]\n\nDiscovery Method Comparison\nMethod\tBest For\tDynamic\nstatic_configs\tFixed infrastructure, dev\tNo\nfile_sd_configs\tCM-managed inventories\tYes (file watch)\nkubernetes_sd_configs\tK8s workloads\tYes (API watch)\nconsul_sd_configs\tConsul service mesh\tYes (Consul watch)\nec2_sd_configs\tAWS EC2 instances\tYes (API poll)\nRecording Rules\n\nPre-compute expensive queries 
for dashboard and alert performance:\n\n# /etc/prometheus/rules/recording_rules.yml\ngroups:\n  - name: api_metrics\n    interval: 15s\n    rules:\n      - record: job:http_requests:rate5m\n        expr: sum by (job) (rate(http_requests_total[5m]))\n\n      - record: job:http_errors:rate5m\n        expr: sum by (job) (rate(http_requests_total{status=~\"5..\"}[5m]))\n\n      - record: job:http_error_rate:ratio\n        expr: job:http_errors:rate5m / job:http_requests:rate5m\n\n      - record: job:http_duration:p95\n        expr: >\n          histogram_quantile(0.95,\n            sum by (job, le) (rate(http_request_duration_seconds_bucket[5m]))\n          )\n\n  - name: resource_metrics\n    interval: 30s\n    rules:\n      - record: instance:node_cpu:utilization\n        expr: >\n          100 - (avg by (instance)\n            (rate(node_cpu_seconds_total{mode=\"idle\"}[5m])) * 100)\n\n      - record: instance:node_memory:utilization\n        expr: >\n          100 - ((node_memory_MemAvailable_bytes\n            / node_memory_MemTotal_bytes) * 100)\n\n      - record: instance:node_disk:utilization\n        expr: >\n          100 - ((node_filesystem_avail_bytes\n            / node_filesystem_size_bytes) * 100)\n\nNaming Convention\nlevel:metric_name:operations\n\nPart\tExample\tMeaning\nlevel\tjob:, instance:\tAggregation level\nmetric_name\thttp_requests\tBase metric\noperations\t:rate5m, :ratio\tApplied functions\nAlert Rules\n# /etc/prometheus/rules/alert_rules.yml\ngroups:\n  - name: availability\n    rules:\n      - alert: ServiceDown\n        expr: up{job=\"my-app\"} == 0\n        for: 1m\n        labels:\n          severity: critical\n        annotations:\n          summary: \"{{ $labels.instance }} is down\"\n          description: \"{{ $labels.job }} down for >1 minute\"\n\n      - alert: HighErrorRate\n        expr: job:http_error_rate:ratio > 0.05\n        for: 5m\n        labels:\n          severity: warning\n        annotations:\n          summary: 
\"Error rate {{ $value | humanizePercentage }} for {{ $labels.job }}\"\n\n      - alert: HighP95Latency\n        expr: job:http_duration:p95 > 1\n        for: 5m\n        labels:\n          severity: warning\n        annotations:\n          summary: \"P95 latency {{ $value }}s for {{ $labels.job }}\"\n\n  - name: resources\n    rules:\n      - alert: HighCPU\n        expr: instance:node_cpu:utilization > 80\n        for: 5m\n        labels: { severity: warning }\n        annotations:\n          summary: \"CPU {{ $value }}% on {{ $labels.instance }}\"\n\n      - alert: HighMemory\n        expr: instance:node_memory:utilization > 85\n        for: 5m\n        labels: { severity: warning }\n        annotations:\n          summary: \"Memory {{ $value }}% on {{ $labels.instance }}\"\n\n      - alert: DiskSpaceLow\n        expr: instance:node_disk:utilization > 90\n        for: 5m\n        labels: { severity: critical }\n        annotations:\n          summary: \"Disk {{ $value }}% on {{ $labels.instance }}\"\n\nAlert Severity Guide\nSeverity\tThreshold\tResponse\ncritical\tService down, data loss risk\tPage on-call immediately\nwarning\tDegraded, approaching limit\tInvestigate within hours\ninfo\tNotable but not urgent\tReview in next business day\nValidation\n# Validate config syntax\npromtool check config prometheus.yml\n\n# Validate rule files\npromtool check rules /etc/prometheus/rules/*.yml\n\n# Test a query\npromtool query instant http://localhost:9090 'up'\n\n# Reload config without restart\ncurl -X POST http://localhost:9090/-/reload\n\nBest Practices\nPractice\tDetail\nNaming: prefix_name_unit\tSnake_case, _total for counters, _seconds/_bytes for units\nScrape intervals 15–60s\tShorter wastes resources and storage\nRecording rules for dashboards\tPre-compute anything queried repeatedly\nMonitor Prometheus itself\tprometheus_tsdb_*, scrape_duration_seconds\nHA deployment\t2+ instances scraping same targets\nRetention planning\tMatch --storage.tsdb.retention.time 
to disk capacity\nFederation for scale\tGlobal Prometheus aggregates from regional instances\nLong-term storage\tThanos or Cortex for >30d retention\nTroubleshooting Quick Reference\nProblem\tDiagnosis\tFix\nTarget shows DOWN\tCheck /targets page for error\tFix firewall, verify endpoint, check TLS\nMetrics missing\tQuery up{job=\"x\"}\tVerify scrape config, check /metrics endpoint\nHigh cardinality\tprometheus_tsdb_head_series growing\tDrop high-cardinality labels with metric_relabel_configs\nStorage filling up\tCheck prometheus_tsdb_storage_*\tReduce retention, add disk, enable compaction\nSlow queries\tCheck prometheus_engine_query_duration_seconds\tAdd recording rules, reduce range, limit series\nConfig not applied\tCheck prometheus_config_last_reload_successful\tFix syntax, POST /-/reload\nNEVER Do\nAnti-Pattern\tWhy\tDo Instead\nScrape interval < 5s\tOverwhelms targets and storage\tUse 15–60s intervals\nHigh-cardinality labels (user ID, request ID)\tExplodes TSDB series count\tUse logs for high-cardinality data\nAlert without for duration\tFires on transient spikes\tAlways set for: 1m minimum\nSkip recording rules\tDashboards compute expensive queries every load\tPre-compute with recording rules\nStore secrets in prometheus.yml\tConfig often in Git\tUse file-based secrets or env substitution\nIgnore up metric\tMiss targets silently going down\tAlert on up == 0 for all jobs\nSingle Prometheus instance in prod\tSingle point of failure\tRun 2+ replicas with shared targets\nUnbounded retention\tDisk fills, Prometheus crashes\tSet explicit --storage.tsdb.retention.time\nTemplates\nTemplate\tDescription\ntemplates/prometheus.yml\tFull config with static, file-based, and K8s discovery\ntemplates/alert-rules.yml\t25+ alert rules by category\ntemplates/recording-rules.yml\tPre-computed metrics for HTTP, latency, resources, SLOs"
  },
  "trust": {
    "sourceLabel": "tencent",
    "provenanceUrl": "https://clawhub.ai/wpank/prometheus-devops",
    "publisherUrl": "https://clawhub.ai/wpank/prometheus-devops",
    "owner": "wpank",
    "version": "1.0.0",
    "license": null,
    "verificationStatus": "Indexed source record"
  },
  "links": {
    "detailUrl": "https://openagent3.xyz/skills/prometheus-devops",
    "downloadUrl": "https://openagent3.xyz/downloads/prometheus-devops",
    "agentUrl": "https://openagent3.xyz/skills/prometheus-devops/agent",
    "manifestUrl": "https://openagent3.xyz/skills/prometheus-devops/agent.json",
    "briefUrl": "https://openagent3.xyz/skills/prometheus-devops/agent.md"
  }
}