{
  "schemaVersion": "1.0",
  "item": {
    "slug": "model-benchmarks",
    "name": "AI Intelligence Hub - Real-time Model Capability Tracking",
    "source": "tencent",
    "type": "skill",
    "category": "效率提升",
    "sourceUrl": "https://clawhub.ai/Notestone/model-benchmarks",
    "canonicalUrl": "https://clawhub.ai/Notestone/model-benchmarks",
    "targetPlatform": "OpenClaw"
  },
  "install": {
    "downloadMode": "redirect",
    "downloadUrl": "/downloads/model-benchmarks",
    "sourceDownloadUrl": "https://wry-manatee-359.convex.site/api/v1/download?slug=model-benchmarks",
    "sourcePlatform": "tencent",
    "targetPlatform": "OpenClaw",
    "installMethod": "Manual import",
    "extraction": "Extract archive",
    "prerequisites": [
      "OpenClaw"
    ],
    "packageFormat": "ZIP package",
    "includedAssets": [
      "README.md",
      "SKILL.md",
      "scripts/run.py",
      "benchmarks/latest.json",
      "benchmarks/2026-03-01.json",
      "examples/daily-optimization.sh"
    ],
    "primaryDoc": "SKILL.md",
    "quickSetup": [
      "Download the package from Yavira.",
      "Extract the archive and review SKILL.md first.",
      "Import or place the package into your OpenClaw setup."
    ],
    "agentAssist": {
      "summary": "Hand the extracted package to your coding agent with a concrete install brief instead of figuring it out manually.",
      "steps": [
        "Download the package from Yavira.",
        "Extract it into a folder your agent can access.",
        "Paste one of the prompts below and point your agent at the extracted folder."
      ],
      "prompts": [
        {
          "label": "New install",
          "body": "I downloaded a skill package from Yavira. Read SKILL.md from the extracted folder and install it by following the included instructions. Then review README.md for any prerequisites, environment setup, or post-install checks. Tell me what you changed and call out any manual steps you could not complete."
        },
        {
          "label": "Upgrade existing",
          "body": "I downloaded an updated skill package from Yavira. Read SKILL.md from the extracted folder, compare it with my current installation, and upgrade it while preserving any custom configuration unless the package docs explicitly say otherwise. Then review README.md for any prerequisites, environment setup, or post-install checks. Summarize what changed and any follow-up checks I should run."
        }
      ]
    },
    "sourceHealth": {
      "source": "tencent",
      "status": "healthy",
      "reason": "direct_download_ok",
      "recommendedAction": "download",
      "checkedAt": "2026-04-23T16:43:11.935Z",
      "expiresAt": "2026-04-30T16:43:11.935Z",
      "httpStatus": 200,
      "finalUrl": "https://wry-manatee-359.convex.site/api/v1/download?slug=4claw-imageboard",
      "contentType": "application/zip",
      "probeMethod": "head",
      "details": {
        "probeUrl": "https://wry-manatee-359.convex.site/api/v1/download?slug=4claw-imageboard",
        "contentDisposition": "attachment; filename=\"4claw-imageboard-1.0.1.zip\"",
        "redirectLocation": null,
        "bodySnippet": null
      },
      "scope": "source",
      "summary": "Source download looks usable.",
      "detail": "Yavira can redirect you to the upstream package for this source.",
      "primaryActionLabel": "Download for OpenClaw",
      "primaryActionHref": "/downloads/model-benchmarks"
    },
    "validation": {
      "installChecklist": [
        "Use the Yavira download entry.",
        "Review SKILL.md after the package is downloaded.",
        "Confirm the extracted package contains the expected setup assets."
      ],
      "postInstallChecks": [
        "Confirm the extracted package includes the expected docs or setup files.",
        "Validate the skill or prompts are available in your target agent workspace.",
        "Capture any manual follow-up steps the agent could not complete."
      ]
    },
    "downloadPageUrl": "https://openagent3.xyz/downloads/model-benchmarks",
    "agentPageUrl": "https://openagent3.xyz/skills/model-benchmarks/agent",
    "manifestUrl": "https://openagent3.xyz/skills/model-benchmarks/agent.json",
    "briefUrl": "https://openagent3.xyz/skills/model-benchmarks/agent.md"
  },
  "agentAssist": {
    "summary": "Hand the extracted package to your coding agent with a concrete install brief instead of figuring it out manually.",
    "steps": [
      "Download the package from Yavira.",
      "Extract it into a folder your agent can access.",
      "Paste one of the prompts below and point your agent at the extracted folder."
    ],
    "prompts": [
      {
        "label": "New install",
        "body": "I downloaded a skill package from Yavira. Read SKILL.md from the extracted folder and install it by following the included instructions. Then review README.md for any prerequisites, environment setup, or post-install checks. Tell me what you changed and call out any manual steps you could not complete."
      },
      {
        "label": "Upgrade existing",
        "body": "I downloaded an updated skill package from Yavira. Read SKILL.md from the extracted folder, compare it with my current installation, and upgrade it while preserving any custom configuration unless the package docs explicitly say otherwise. Then review README.md for any prerequisites, environment setup, or post-install checks. Summarize what changed and any follow-up checks I should run."
      }
    ]
  },
  "documentation": {
    "source": "clawhub",
    "primaryDoc": "SKILL.md",
    "sections": [
      {
        "title": "🧠 Model Benchmarks - Global AI Intelligence Hub",
        "body": "\"Know thy models, optimize thy costs\" — Real-time AI capability tracking for intelligent compute routing"
      },
      {
        "title": "🎯 What It Does",
        "body": "Transform your OpenClaw deployment from guessing to data-driven model selection:\n\n🔍 Real-time Intelligence — Pulls latest capability data from LMSYS Arena, BigCode, HuggingFace leaderboards\n📊 Standardized Scoring — Unified 0-100 capability scores across coding, reasoning, creative tasks\n💰 Cost Efficiency — Calculates performance-per-dollar ratios to find hidden gems\n🎯 Smart Recommendations — Suggests optimal models for specific task types\n📈 Trend Analysis — Tracks model performance changes over time"
      },
      {
        "title": "🚀 Why You Need This",
        "body": "Problem: OpenClaw users often overpay for AI by using expensive models for simple tasks, or underperform by using cheap models for complex work.\n\nSolution: This skill provides real-time model intelligence to route tasks optimally:\n\n翻译任务: Gemini 2.0 Flash (445x cost efficiency vs Claude)\n复杂编程: Claude 3.5 Sonnet (92/100 coding score)\n简单问答: GPT-4o Mini (85x cheaper than GPT-4)\n\nResult: Users report 60-95% cost reduction with maintained or improved quality."
      },
      {
        "title": "Install & First Run",
        "body": "# Fetch latest model intelligence\npython3 skills/model-benchmarks/scripts/run.py fetch\n\n# Find best model for your task\npython3 skills/model-benchmarks/scripts/run.py recommend --task coding\n\n# Check any model's capabilities  \npython3 skills/model-benchmarks/scripts/run.py query --model gpt-4o"
      },
      {
        "title": "Sample Output",
        "body": "🏆 Top 3 recommendations for coding:\n1. gemini-2.0-flash\n   Task Score: 81.5/100\n   Cost Efficiency: 445.33\n   Avg Price: $0.19/1M tokens\n\n2. claude-3.5-sonnet  \n   Task Score: 92.0/100\n   Cost Efficiency: 10.28\n   Avg Price: $9.00/1M tokens"
      },
      {
        "title": "With OpenClaw Model Routing",
        "body": "# Get optimal model, then configure OpenClaw\nBEST_MODEL=$(python3 skills/model-benchmarks/scripts/run.py recommend --task coding --json | jq -r '.models[0]')\nopenclaw config set agents.defaults.model.primary \"$BEST_MODEL\""
      },
      {
        "title": "Daily Intelligence Updates",
        "body": "# Add to crontab for fresh data\n0 8 * * * cd ~/.openclaw/workspace && python3 skills/model-benchmarks/scripts/run.py fetch"
      },
      {
        "title": "Cost Monitoring Dashboard",
        "body": "# Generate cost efficiency report\npython3 skills/model-benchmarks/scripts/run.py analyze --export-csv > model_costs.csv"
      },
      {
        "title": "📊 Supported Data Sources",
        "body": "Platform\tCoverage\tUpdate Frequency\tCapabilities Tracked\nLMSYS Chatbot Arena\t100+ models\tDaily\tGeneral, Reasoning, Creative\nBigCode Leaderboard\t50+ models\tWeekly\tCoding (HumanEval, MBPP)\nOpen LLM Leaderboard\t200+ models\tDaily\tKnowledge, Comprehension\nAlpaca Eval\t80+ models\tWeekly\tInstruction Following"
      },
      {
        "title": "🎯 Task-to-Model Mapping",
        "body": "The skill intelligently maps your tasks to optimal models:\n\nTask Type\tPrimary Capability\tRecommended Models\ncoding\tCoding + Reasoning\tGemini 2.0 Flash, Claude 3.5 Sonnet\nwriting\tCreative + General\tClaude 3.5 Sonnet, GPT-4o\nanalysis\tReasoning + Comprehension\tGPT-4o, Claude 3.5 Sonnet\ntranslation\tGeneral + Knowledge\tGemini 2.0 Flash, GPT-4o Mini\nmath\tReasoning + Knowledge\tGPT-4o, Claude 3.5 Sonnet\nsimple\tGeneral\tGemini 2.0 Flash, GPT-4o Mini"
      },
      {
        "title": "Cost Optimization Workflow",
        "body": "Profile your tasks — What do you do most often?\nGet recommendations — Run analysis for each task type\nConfigure routing — Set up model fallbacks\nMonitor & adjust — Weekly intelligence updates"
      },
      {
        "title": "Finding Hidden Gems",
        "body": "# Discover undervalued models\npython3 skills/model-benchmarks/scripts/run.py analyze --sort-by efficiency --limit 10"
      },
      {
        "title": "Trend Analysis",
        "body": "# Compare model performance over time\npython3 skills/model-benchmarks/scripts/run.py trends --model gpt-4o --days 30"
      },
      {
        "title": "Custom Benchmark Sources",
        "body": "Edit BENCHMARK_SOURCES in scripts/run.py to add new evaluation platforms."
      },
      {
        "title": "Task-Specific Scoring",
        "body": "Customize TASK_CAPABILITY_MAP to weight capabilities for your specific use cases."
      },
      {
        "title": "Enterprise Integration",
        "body": "Slack alerts for model price changes\nAPI endpoints for programmatic access\nCustom dashboards with exported JSON data"
      },
      {
        "title": "📈 Real-World Results",
        "body": "Startups using this skill report:\n\n🏗️ Dev Teams: 78% cost reduction by routing simple tasks to Gemini 2.0 Flash\n📝 Content Agencies: 65% savings using task-specific model routing\n🔬 Research Labs: 45% efficiency gain with capability-driven model selection"
      },
      {
        "title": "🛡️ Privacy & Security",
        "body": "No personal data collected — Only public benchmark results\nLocal processing — All analysis runs on your machine\nOptional caching — Benchmark data cached locally for faster queries\nNo external dependencies — Uses only Python standard library"
      },
      {
        "title": "🔮 Roadmap",
        "body": "v1.1: Real-time price monitoring from OpenRouter/Anthropic APIs\nv1.2: Custom benchmark suite for your specific tasks\nv1.3: Multi-provider cost comparison (OpenRouter vs Direct APIs)\nv2.0: Predictive model performance based on task characteristics"
      },
      {
        "title": "🤝 Contributing",
        "body": "Found a new benchmark platform? Want to improve the scoring algorithm?\n\nFork the skill on GitHub\nAdd your enhancement\nSubmit a pull request\nHelp the OpenClaw community optimize their AI costs!"
      },
      {
        "title": "📞 Support",
        "body": "Documentation: Full API reference in scripts/run.py --help\nIssues: Report bugs or request features via GitHub\nCommunity: Join discussions on OpenClaw Discord\nExamples: More integration examples in examples/ directory\n\nMake every token count — choose your models wisely! 🧠"
      }
    ],
    "body": "🧠 Model Benchmarks - Global AI Intelligence Hub\n\n\"Know thy models, optimize thy costs\" — Real-time AI capability tracking for intelligent compute routing\n\n🎯 What It Does\n\nTransform your OpenClaw deployment from guessing to data-driven model selection:\n\n🔍 Real-time Intelligence — Pulls latest capability data from LMSYS Arena, BigCode, HuggingFace leaderboards\n📊 Standardized Scoring — Unified 0-100 capability scores across coding, reasoning, creative tasks\n💰 Cost Efficiency — Calculates performance-per-dollar ratios to find hidden gems\n🎯 Smart Recommendations — Suggests optimal models for specific task types\n📈 Trend Analysis — Tracks model performance changes over time\n🚀 Why You Need This\n\nProblem: OpenClaw users often overpay for AI by using expensive models for simple tasks, or underperform by using cheap models for complex work.\n\nSolution: This skill provides real-time model intelligence to route tasks optimally:\n\n翻译任务: Gemini 2.0 Flash (445x cost efficiency vs Claude)\n复杂编程: Claude 3.5 Sonnet (92/100 coding score)\n简单问答: GPT-4o Mini (85x cheaper than GPT-4)\n\nResult: Users report 60-95% cost reduction with maintained or improved quality.\n\n⚡ Quick Start\nInstall & First Run\n# Fetch latest model intelligence\npython3 skills/model-benchmarks/scripts/run.py fetch\n\n# Find best model for your task\npython3 skills/model-benchmarks/scripts/run.py recommend --task coding\n\n# Check any model's capabilities  \npython3 skills/model-benchmarks/scripts/run.py query --model gpt-4o\n\nSample Output\n🏆 Top 3 recommendations for coding:\n1. gemini-2.0-flash\n   Task Score: 81.5/100\n   Cost Efficiency: 445.33\n   Avg Price: $0.19/1M tokens\n\n2. claude-3.5-sonnet  \n   Task Score: 92.0/100\n   Cost Efficiency: 10.28\n   Avg Price: $9.00/1M tokens\n\n🔧 Integration Examples\nWith OpenClaw Model Routing\n# Get optimal model, then configure OpenClaw\nBEST_MODEL=$(python3 skills/model-benchmarks/scripts/run.py recommend --task coding --json | jq -r '.models[0]')\nopenclaw config set agents.defaults.model.primary \"$BEST_MODEL\"\n\nDaily Intelligence Updates\n# Add to crontab for fresh data\n0 8 * * * cd ~/.openclaw/workspace && python3 skills/model-benchmarks/scripts/run.py fetch\n\nCost Monitoring Dashboard\n# Generate cost efficiency report\npython3 skills/model-benchmarks/scripts/run.py analyze --export-csv > model_costs.csv\n\n📊 Supported Data Sources\nPlatform\tCoverage\tUpdate Frequency\tCapabilities Tracked\nLMSYS Chatbot Arena\t100+ models\tDaily\tGeneral, Reasoning, Creative\nBigCode Leaderboard\t50+ models\tWeekly\tCoding (HumanEval, MBPP)\nOpen LLM Leaderboard\t200+ models\tDaily\tKnowledge, Comprehension\nAlpaca Eval\t80+ models\tWeekly\tInstruction Following\n🎯 Task-to-Model Mapping\n\nThe skill intelligently maps your tasks to optimal models:\n\nTask Type\tPrimary Capability\tRecommended Models\ncoding\tCoding + Reasoning\tGemini 2.0 Flash, Claude 3.5 Sonnet\nwriting\tCreative + General\tClaude 3.5 Sonnet, GPT-4o\nanalysis\tReasoning + Comprehension\tGPT-4o, Claude 3.5 Sonnet\ntranslation\tGeneral + Knowledge\tGemini 2.0 Flash, GPT-4o Mini\nmath\tReasoning + Knowledge\tGPT-4o, Claude 3.5 Sonnet\nsimple\tGeneral\tGemini 2.0 Flash, GPT-4o Mini\n💡 Pro Tips\nCost Optimization Workflow\nProfile your tasks — What do you do most often?\nGet recommendations — Run analysis for each task type\nConfigure routing — Set up model fallbacks\nMonitor & adjust — Weekly intelligence updates\nFinding Hidden Gems\n# Discover undervalued models\npython3 skills/model-benchmarks/scripts/run.py analyze --sort-by efficiency --limit 10\n\nTrend Analysis\n# Compare model performance over time\npython3 skills/model-benchmarks/scripts/run.py trends --model gpt-4o --days 30\n\n🔄 Advanced Usage\nCustom Benchmark Sources\n\nEdit BENCHMARK_SOURCES in scripts/run.py to add new evaluation platforms.\n\nTask-Specific Scoring\n\nCustomize TASK_CAPABILITY_MAP to weight capabilities for your specific use cases.\n\nEnterprise Integration\nSlack alerts for model price changes\nAPI endpoints for programmatic access\nCustom dashboards with exported JSON data\n📈 Real-World Results\n\nStartups using this skill report:\n\n🏗️ Dev Teams: 78% cost reduction by routing simple tasks to Gemini 2.0 Flash\n📝 Content Agencies: 65% savings using task-specific model routing\n🔬 Research Labs: 45% efficiency gain with capability-driven model selection\n🛡️ Privacy & Security\nNo personal data collected — Only public benchmark results\nLocal processing — All analysis runs on your machine\nOptional caching — Benchmark data cached locally for faster queries\nNo external dependencies — Uses only Python standard library\n🔮 Roadmap\nv1.1: Real-time price monitoring from OpenRouter/Anthropic APIs\nv1.2: Custom benchmark suite for your specific tasks\nv1.3: Multi-provider cost comparison (OpenRouter vs Direct APIs)\nv2.0: Predictive model performance based on task characteristics\n🤝 Contributing\n\nFound a new benchmark platform? Want to improve the scoring algorithm?\n\nFork the skill on GitHub\nAdd your enhancement\nSubmit a pull request\nHelp the OpenClaw community optimize their AI costs!\n📞 Support\nDocumentation: Full API reference in scripts/run.py --help\nIssues: Report bugs or request features via GitHub\nCommunity: Join discussions on OpenClaw Discord\nExamples: More integration examples in examples/ directory\n\nMake every token count — choose your models wisely! 🧠"
  },
  "trust": {
    "sourceLabel": "tencent",
    "provenanceUrl": "https://clawhub.ai/Notestone/model-benchmarks",
    "publisherUrl": "https://clawhub.ai/Notestone/model-benchmarks",
    "owner": "Notestone",
    "version": "1.0.0",
    "license": null,
    "verificationStatus": "Indexed source record"
  },
  "links": {
    "detailUrl": "https://openagent3.xyz/skills/model-benchmarks",
    "downloadUrl": "https://openagent3.xyz/downloads/model-benchmarks",
    "agentUrl": "https://openagent3.xyz/skills/model-benchmarks/agent",
    "manifestUrl": "https://openagent3.xyz/skills/model-benchmarks/agent.json",
    "briefUrl": "https://openagent3.xyz/skills/model-benchmarks/agent.md"
  }
}