{
  "schemaVersion": "1.0",
  "item": {
    "slug": "modelshow",
    "name": "Modelshow",
    "source": "tencent",
    "type": "skill",
    "category": "AI 智能",
    "sourceUrl": "https://clawhub.ai/schbz/modelshow",
    "canonicalUrl": "https://clawhub.ai/schbz/modelshow",
    "targetPlatform": "OpenClaw"
  },
  "install": {
    "downloadMode": "redirect",
    "downloadUrl": "/downloads/modelshow",
    "sourceDownloadUrl": "https://wry-manatee-359.convex.site/api/v1/download?slug=modelshow",
    "sourcePlatform": "tencent",
    "targetPlatform": "OpenClaw",
    "installMethod": "Manual import",
    "extraction": "Extract archive",
    "prerequisites": [
      "OpenClaw"
    ],
    "packageFormat": "ZIP package",
    "includedAssets": [
      "blind_judge_manager.py",
      "config.json",
      "judge_pipeline.py",
      "README.md",
      "save_results.py",
      "skill.json"
    ],
    "primaryDoc": "SKILL.md",
    "quickSetup": [
      "Download the package from Yavira.",
      "Extract the archive and review SKILL.md first.",
      "Import or place the package into your OpenClaw setup."
    ],
    "agentAssist": {
      "summary": "Hand the extracted package to your coding agent with a concrete install brief instead of figuring it out manually.",
      "steps": [
        "Download the package from Yavira.",
        "Extract it into a folder your agent can access.",
        "Paste one of the prompts below and point your agent at the extracted folder."
      ],
      "prompts": [
        {
          "label": "New install",
          "body": "I downloaded a skill package from Yavira. Read SKILL.md from the extracted folder and install it by following the included instructions. Then review README.md for any prerequisites, environment setup, or post-install checks. Tell me what you changed and call out any manual steps you could not complete."
        },
        {
          "label": "Upgrade existing",
          "body": "I downloaded an updated skill package from Yavira. Read SKILL.md from the extracted folder, compare it with my current installation, and upgrade it while preserving any custom configuration unless the package docs explicitly say otherwise. Then review README.md for any prerequisites, environment setup, or post-install checks. Summarize what changed and any follow-up checks I should run."
        }
      ]
    },
    "sourceHealth": {
      "source": "tencent",
      "status": "healthy",
      "reason": "direct_download_ok",
      "recommendedAction": "download",
      "checkedAt": "2026-04-30T16:55:25.780Z",
      "expiresAt": "2026-05-07T16:55:25.780Z",
      "httpStatus": 200,
      "finalUrl": "https://wry-manatee-359.convex.site/api/v1/download?slug=network",
      "contentType": "application/zip",
      "probeMethod": "head",
      "details": {
        "probeUrl": "https://wry-manatee-359.convex.site/api/v1/download?slug=network",
        "contentDisposition": "attachment; filename=\"network-1.0.0.zip\"",
        "redirectLocation": null,
        "bodySnippet": null
      },
      "scope": "source",
      "summary": "Source download looks usable.",
      "detail": "Yavira can redirect you to the upstream package for this source.",
      "primaryActionLabel": "Download for OpenClaw",
      "primaryActionHref": "/downloads/modelshow"
    },
    "validation": {
      "installChecklist": [
        "Use the Yavira download entry.",
        "Review SKILL.md after the package is downloaded.",
        "Confirm the extracted package contains the expected setup assets."
      ],
      "postInstallChecks": [
        "Confirm the extracted package includes the expected docs or setup files.",
        "Validate the skill or prompts are available in your target agent workspace.",
        "Capture any manual follow-up steps the agent could not complete."
      ]
    },
    "downloadPageUrl": "https://openagent3.xyz/downloads/modelshow",
    "agentPageUrl": "https://openagent3.xyz/skills/modelshow/agent",
    "manifestUrl": "https://openagent3.xyz/skills/modelshow/agent.json",
    "briefUrl": "https://openagent3.xyz/skills/modelshow/agent.md"
  },
  "agentAssist": {
    "summary": "Hand the extracted package to your coding agent with a concrete install brief instead of figuring it out manually.",
    "steps": [
      "Download the package from Yavira.",
      "Extract it into a folder your agent can access.",
      "Paste one of the prompts below and point your agent at the extracted folder."
    ],
    "prompts": [
      {
        "label": "New install",
        "body": "I downloaded a skill package from Yavira. Read SKILL.md from the extracted folder and install it by following the included instructions. Then review README.md for any prerequisites, environment setup, or post-install checks. Tell me what you changed and call out any manual steps you could not complete."
      },
      {
        "label": "Upgrade existing",
        "body": "I downloaded an updated skill package from Yavira. Read SKILL.md from the extracted folder, compare it with my current installation, and upgrade it while preserving any custom configuration unless the package docs explicitly say otherwise. Then review README.md for any prerequisites, environment setup, or post-install checks. Summarize what changed and any follow-up checks I should run."
      }
    ]
  },
  "documentation": {
    "source": "clawhub",
    "primaryDoc": "SKILL.md",
    "sections": [
      {
        "title": "ModelShow — Professional Multi-Model Evaluation",
        "body": "ModelShow provides a sophisticated framework for comparing AI model responses through double-blind evaluation. The system queries multiple models in parallel, anonymizes their outputs, and uses an independent judge model to rank responses purely on merit."
      },
      {
        "title": "Key Features",
        "body": "Architecturally Guaranteed De-anonymization: The judge sub-agent automatically de-anonymizes results before returning them—orchestrators never see placeholder labels\nCryptographic Randomization: Responses are presented to the judge in cryptographically secure random order using secrets.SystemRandom()\nHolistic Judge Analysis: Judges provide both per-model rankings and comprehensive \"Overall Assessment\" analyzing cross-model patterns\nIntelligent Polling: Automatic progress monitoring with content-free status updates and immediate completion detection\nProfessional Output: Formatted results with scores, judge commentary, and actionable insights"
      },
      {
        "title": "Detection",
        "body": "Trigger: Message starts with mdls or modelshow (case-insensitive). Extract the prompt by removing the trigger keyword.\n\nExample: mdls explain quantum entanglement → prompt = explain quantum entanglement"
      },
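      {
        "title": "Trigger Extraction (Illustrative Sketch)",
        "body": "A minimal Python sketch of the trigger extraction described above. It is not a file shipped in the package, just an illustration, assuming the keyword list comes from config.json:\n\nimport re\n\ndef extract_prompt(message, keywords=(\"mdls\", \"modelshow\")):\n    # Case-insensitive match of a trigger keyword at the start of the\n    # message, followed by whitespace and the actual prompt.\n    pattern = r\"^(?:\" + \"|\".join(map(re.escape, keywords)) + r\")\\s+(.*)$\"\n    m = re.match(pattern, message.strip(), re.IGNORECASE)\n    return m.group(1) if m else None\n\n# extract_prompt(\"mdls explain quantum entanglement\")\n# → \"explain quantum entanglement\""
      },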
      {
        "title": "Workflow",
        "body": "Step 1  → Acknowledge & Load Configuration\nStep 2  → Spawn Parallel Model Agents\nStep 3  → Collect Responses with Intelligent Polling\nStep 4  → Anonymize with Cryptographic Randomization\nStep 5  → Spawn Judge+Deanon Sub-Agent\nStep 6  → Parse De-anonymized Results\nStep 7  → Build Formatted Output\nStep 8  → Save Results (optionally update web index via update_modelshow_index.py)"
      },
      {
        "title": "Step 1: Acknowledge & Load Configuration",
        "body": "Immediate Response:\n\n🔄 ModelShow starting — querying models in parallel.\nResults will appear automatically when judging is complete.\n\nLoad Configuration: Read {baseDir}/config.json for model list, judge model, timeouts, and other settings."
      },
      {
        "title": "Step 2: Spawn Parallel Model Agents",
        "body": "For each model in config.models:\n\nModel: The model alias (e.g., pro, grok, kimi)\nLabel: mdls-{model}-{timestamp} (unique identifier)\nTimeout: config.timeoutSeconds (default: 360 seconds)\nTask:\n{config.systemPrompt}\n\n{extracted user prompt}\n\nParallel Execution: If config.parallel is true, spawn all agents simultaneously.\n\nContext Handling: If the prompt references external content (URLs, files, preferences), fetch and prepend this context to the task."
      },
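      {
        "title": "Parallel Fan-Out (Illustrative Sketch)",
        "body": "How the parallel fan-out could look in plain Python. This is a sketch only, since the real skill spawns platform sub-agents; query_model is a hypothetical stand-in for OpenClaw's agent-spawn call:\n\nfrom concurrent.futures import ThreadPoolExecutor\n\ndef query_model(alias, task, timeout):\n    # Hypothetical placeholder for the platform's agent-spawn API.\n    raise NotImplementedError\n\ndef spawn_all(models, task, timeout=360):\n    # Mirrors config.parallel = true: one worker per model alias.\n    results = {}\n    with ThreadPoolExecutor(max_workers=len(models)) as pool:\n        futures = {alias: pool.submit(query_model, alias, task, timeout) for alias in models}\n        for alias, fut in futures.items():\n            try:\n                results[alias] = {\"status\": \"completed\", \"text\": fut.result(timeout=timeout)}\n            except Exception:\n                # Covers failures and per-model timeouts alike in this sketch.\n                results[alias] = {\"status\": \"failed\", \"text\": \"\"}\n    return results"
      },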
      {
        "title": "Step 3: Collect Responses with Intelligent Polling",
        "body": "Polling Strategy:\n\nPoll every 20 seconds\nExit immediately when all agents complete\nMinimum 3 polls before considering timeout\nMaximum runtime: config.timeoutSeconds\n\nStatus Updates (content-free):\n\n⏳ Models responding... {done}/{total} complete. ({elapsed}s elapsed)\n✅ All {N} models responded. Sending to judge...\n\nResponse Collection:\n\ncollected_responses = {\n  \"model_name\": {\n    \"status\": \"completed\" | \"failed\" | \"timeout\",\n    \"text\": \"response text or empty string\",\n    \"duration_seconds\": duration\n  }\n}\n\nMinimum Success Check: If successful responses < config.minSuccessful, abort with informative message."
      },
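      {
        "title": "Polling Loop (Illustrative Sketch)",
        "body": "The polling strategy above, written out as a small Python loop. Illustrative only; count_done is a hypothetical callable returning how many agents have finished:\n\nimport time\n\ndef poll_until_done(count_done, total, timeout_seconds=360, interval=20, min_polls=3):\n    start, polls = time.time(), 0\n    while True:\n        done, polls = count_done(), polls + 1\n        elapsed = int(time.time() - start)\n        if done == total:\n            return True   # exit immediately when all agents complete\n        if polls >= min_polls and elapsed >= timeout_seconds:\n            return False  # time out only after the minimum poll count\n        print(f\"⏳ Models responding... {done}/{total} complete. ({elapsed}s elapsed)\")\n        time.sleep(interval)"
      },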
      {
        "title": "Step 4: Anonymize with Cryptographic Randomization",
        "body": "Execute the anonymization pipeline:\n\necho '{\n  \"action\": \"anonymize\",\n  \"responses\": {model: response_dict},\n  \"label_style\": \"alphabetic\",\n  \"shuffle\": true\n}' | python3 {baseDir}/judge_pipeline.py\n\nKey Features:\n\nshuffle: true ensures cryptographically random response order\nLabels are assigned as \"Response A\", \"Response B\", etc.\nanonymization_map tracks label-to-model mapping for later de-anonymization"
      },
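      {
        "title": "Anonymization Internals (Illustrative Sketch)",
        "body": "What the \"anonymize\" action plausibly does internally, sketched in Python from the documented guarantees (secrets.SystemRandom shuffle, alphabetic labels). The shipped judge_pipeline.py may differ in detail:\n\nimport secrets\nimport string\n\ndef anonymize(responses):\n    # Shuffle model order with a cryptographically secure RNG,\n    # then assign \"Response A\", \"Response B\", ... labels.\n    rng = secrets.SystemRandom()\n    models = list(responses)\n    rng.shuffle(models)\n    blind, anonymization_map = {}, {}\n    for letter, model in zip(string.ascii_uppercase, models):\n        label = f\"Response {letter}\"\n        blind[label] = responses[model][\"text\"]\n        anonymization_map[label] = model  # kept for later de-anonymization\n    return blind, anonymization_map"
      },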
      {
        "title": "Step 5: Spawn Judge+Deanon Sub-Agent",
        "body": "The judge sub-agent performs both evaluation and de-anonymization in a single atomic operation:\n\nJudge Task Structure:\n\nYou are an impartial judge AND a data processor.\n\nYour task has TWO parts. Complete BOTH before returning anything.\n\n═══════════════════════════════════════════════════════════\nPART 1: JUDGE THE RESPONSES\n═══════════════════════════════════════════════════════════\n\n[Blind responses with placeholder labels]\n\n═══════════════════════════════════════════════════════════\nPART 2: PROCESS YOUR JUDGMENT\n═══════════════════════════════════════════════════════════\n\n1. Write your judgment evaluating Response A, Response B, etc.\n2. Include scores (1-10) for each response\n3. Provide an \"Overall Assessment\" section analyzing cross-model patterns\n\nAfter writing your judgment, run this command:\n\necho '{\n  \"action\": \"finalize\",\n  \"judge_output\": \"[YOUR JUDGMENT TEXT HERE]\",\n  \"anonymization_map\": {anonymization_map}\n}' | python3 {baseDir}/judge_pipeline.py\n\nReturn ONLY the JSON output from that command.\n\nJudge Model: Uses config.judgeModel (e.g. sonnet, gemini31or)"
      },
      {
        "title": "Step 6: Parse De-anonymized Results",
        "body": "The judge sub-agent returns:\n\ndeanonymized_judge_output: Full judgment with real model names\nranked_models_deanonymized: Structured ranking data\ndeanonymization_complete: Boolean verification\n\nArchitectural Guarantee: The orchestrator never receives placeholder labels—only de-anonymized results."
      },
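      {
        "title": "Parsing the Judge Reply (Illustrative Sketch)",
        "body": "A minimal Python sketch of consuming the judge sub-agent's reply, using the three documented keys; the exact JSON envelope is an assumption:\n\nimport json\n\ndef parse_judge_result(raw):\n    result = json.loads(raw)\n    # Trust the output only if de-anonymization is verified.\n    if not result.get(\"deanonymization_complete\"):\n        raise ValueError(\"judge output was not de-anonymized\")\n    return result[\"deanonymized_judge_output\"], result[\"ranked_models_deanonymized\"]"
      },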
      {
        "title": "Step 7: Build Formatted Output",
        "body": "Create professional presentation:\n\n🕶️ Double-Blind Judging Results:\n\n🏆 Model Name (Score: X.X/10)\n[Full response text]\nJudge's assessment: [Commentary]\n\n🥈 Second Place (Score: X.X/10)\n[Full response text]\nJudge's assessment: [Commentary]\n\n📊 Overall Assessment:\n[Judge's holistic analysis of cross-model patterns]\n\n⚠️ WARNING: Step 8 is MANDATORY.\nSending results to the user is NOT the end of the task.\nYou MUST execute Step 8 (save_results) immediately after sending.\nThe task is NOT complete until save_results.py returns {\"success\": true}.\nDo NOT skip, defer, or omit Step 8 under any circumstances."
      },
      {
        "title": "Step 8: Save Results",
        "body": "🚨 THIS STEP IS REQUIRED. DO NOT SKIP.\nAfter every ModelShow run, you MUST save the results using save_results.py.\nFailure to save means the results are not persisted for later use.\n\nSave to config.outputDir (default: ~/.openclaw/workspace/modelshow-results):\n\nJSON: {config.outputDir}/{slug}-{timestamp}.json\nMarkdown: {config.outputDir}/{slug}-{timestamp}.md\n\nExact JSON payload to pipe into save_results.py:\n\n{\n  \"prompt\": \"<the original user prompt>\",\n  \"timestamp\": \"<ISO 8601 timestamp, e.g. 2026-03-08T01:00:00Z>\",\n  \"models\": [\"model1\", \"model2\", \"model3\"],\n  \"judge_model\": \"<config.judgeModel>\",\n  \"output_dir\": \"<config.outputDir>\",\n  \"ranked_results\": [\n    {\n      \"rank\": 1,\n      \"model\": \"model_alias\",\n      \"score\": 9.5,\n      \"judge_notes\": \"Judge's per-model commentary here\",\n      \"response_text\": \"The full model response text here\"\n    },\n    {\n      \"rank\": 2,\n      \"model\": \"model_alias\",\n      \"score\": 8.0,\n      \"judge_notes\": \"Judge's per-model commentary here\",\n      \"response_text\": \"The full model response text here\"\n    }\n  ],\n  \"deanonymized_judge_output\": \"<full judge output text with real model names>\",\n  \"anonymization_map\": {\n    \"Response A\": \"model_alias_1\",\n    \"Response B\": \"model_alias_2\"\n  },\n  \"metadata\": {\n    \"total_duration_ms\": 45000,\n    \"successful_models\": 4,\n    \"failed_models\": 0,\n    \"timed_out_models\": [\"deepseek\"]\n  }\n}\n\nExecute the save command:\n\necho '<JSON payload above>' | python3 {baseDir}/save_results.py\n\nVerify success: The script MUST return {\"success\": true, ...}. If it returns an error, fix and retry. Do NOT proceed without a successful save.\n\nOptional: For building a local index of result files (e.g. for a custom dashboard or static site) or for web display (e.g. rexuvia.com), see update_modelshow_index.py. This is not part of the mandatory workflow.\n\n✅ Only after save_results.py returns success is the ModelShow task complete."
      },
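      {
        "title": "Scripting the Save (Illustrative Sketch)",
        "body": "The documented echo '<payload>' | python3 save_results.py call, wrapped in Python with the mandatory success check. A sketch, assuming save_results.py prints its result JSON to stdout:\n\nimport json\nimport subprocess\n\ndef save_results(payload, base_dir):\n    proc = subprocess.run(\n        [\"python3\", f\"{base_dir}/save_results.py\"],\n        input=json.dumps(payload), capture_output=True, text=True)\n    result = json.loads(proc.stdout)\n    if not result.get(\"success\"):\n        raise RuntimeError(f\"save failed: {result}\")  # fix and retry per the docs\n    return result"
      },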
      {
        "title": "Configuration (config.json)",
        "body": "KeyDescriptionDefaultkeywordPrimary trigger\"mdls\"alternativeKeywordsAlso trigger on[\"modelshow\"]modelsList of model aliases to compare[\"pro\", \"sonnet\", \"deepseek\", \"gpt4\", \"grok\", \"kimi\"]judgeModelModel for double-blind evaluation\"sonnet\"outputDirWhere to save result files\"~/.openclaw/workspace/modelshow-results\"timeoutSecondsMaximum wait time per model360minSuccessfulMinimum responses to proceed2parallelRun models in paralleltrueshowTopNNumber of top results to display10includeResponseTextInclude full responses in outputtrueblindJudgingEnable anonymizationtrueblindJudgingLabelsLabel style for anonymization\"alphabetic\"shuffleBlindOrderRandomize response ordertrue"
      },
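      {
        "title": "Example config.json",
        "body": "The defaults from the table above, assembled into a complete config.json for reference:\n\n{\n  \"keyword\": \"mdls\",\n  \"alternativeKeywords\": [\"modelshow\"],\n  \"models\": [\"pro\", \"sonnet\", \"deepseek\", \"gpt4\", \"grok\", \"kimi\"],\n  \"judgeModel\": \"sonnet\",\n  \"outputDir\": \"~/.openclaw/workspace/modelshow-results\",\n  \"timeoutSeconds\": 360,\n  \"minSuccessful\": 2,\n  \"parallel\": true,\n  \"showTopN\": 10,\n  \"includeResponseText\": true,\n  \"blindJudging\": true,\n  \"blindJudgingLabels\": \"alphabetic\",\n  \"shuffleBlindOrder\": true\n}"
      },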
      {
        "title": "File Structure",
        "body": "modelshow/\n├── SKILL.md              # This documentation\n├── config.json           # Configuration settings\n├── judge_pipeline.py     # Anonymization & de-anonymization pipeline\n├── save_results.py       # Result saving with holistic assessment extraction\n├── update_modelshow_index.py # Optional: build local index / web index\n├── blind_judge_manager.py # Anonymization utility (legacy)\n├── README.md             # User documentation\n└── .gitignore            # Git exclusions"
      },
      {
        "title": "judge_pipeline.py",
        "body": "Core pipeline for anonymization and de-anonymization:\n\naction: \"anonymize\": Creates cryptographically randomized blind responses\naction: \"finalize\": De-anonymizes judge output and extracts rankings"
      },
      {
        "title": "save_results.py",
        "body": "Saves results in both JSON and Markdown formats with specialized extraction of the \"Overall Assessment\" section from judge output. Results are written to config.outputDir for local use, scripting, or your own tooling."
      },
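      {
        "title": "Reading Saved Results (Illustrative Sketch)",
        "body": "A small Python sketch for the \"local use, scripting\" case: load the most recently written result JSON from config.outputDir (the path shown is the documented default):\n\nimport json\nfrom pathlib import Path\n\ndef latest_result(output_dir=\"~/.openclaw/workspace/modelshow-results\"):\n    # Sort by modification time so the newest run wins.\n    files = sorted(Path(output_dir).expanduser().glob(\"*.json\"),\n                   key=lambda p: p.stat().st_mtime)\n    return json.loads(files[-1].read_text()) if files else None"
      },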
      {
        "title": "update_modelshow_index.py",
        "body": "Optional utility to build a local index of result JSON files (e.g. for a custom dashboard or static site) or to update the web index for rexuvia.com. Not required for the core workflow."
      },
      {
        "title": "Usage Examples",
        "body": "Basic Comparison:\n\nmdls explain the difference between TCP and UDP\n\nCreative Task:\n\nmdls write a short poem about working late at night\n\nTechnical Analysis:\n\nmdls pros and cons of event sourcing vs traditional CRUD\n\nCode Review:\n\nmdls review this Python function for potential issues: [code]"
      },
      {
        "title": "Best Practices",
        "body": "Prompt Clarity: Provide clear, specific prompts for meaningful comparisons\nModel Selection: Choose models with complementary strengths for the task type\nContext Inclusion: Reference relevant context when appropriate\nResult Interpretation: Consider both scores and the judge's holistic assessment\nTailor config: Update config.json to match the models available on your instance\nWeb Integration: Optionally use update_modelshow_index.py to publish results"
      },
      {
        "title": "Integration Points",
        "body": "Local storage: Results are saved as JSON and Markdown in config.outputDir for local use, scripting, or your own tooling\nWeb display: Use update_modelshow_index.py to make results available online\nCron Automation: Can be scheduled for regular comparative analysis\nAPI Access: JSON results enable programmatic analysis\n\nModelShow represents state-of-the-art in AI model comparison, combining rigorous methodology with practical usability for both casual exploration and professional evaluation."
      }
    ],
    "body": "ModelShow — Professional Multi-Model Evaluation\n\nModelShow provides a sophisticated framework for comparing AI model responses through double-blind evaluation. The system queries multiple models in parallel, anonymizes their outputs, and uses an independent judge model to rank responses purely on merit.\n\nKey Features\nArchitecturally Guaranteed De-anonymization: The judge sub-agent automatically de-anonymizes results before returning them—orchestrators never see placeholder labels\nCryptographic Randomization: Responses are presented to the judge in cryptographically secure random order using secrets.SystemRandom()\nHolistic Judge Analysis: Judges provide both per-model rankings and comprehensive \"Overall Assessment\" analyzing cross-model patterns\nIntelligent Polling: Automatic progress monitoring with content-free status updates and immediate completion detection\nProfessional Output: Formatted results with scores, judge commentary, and actionable insights\nDetection\n\nTrigger: Message starts with mdls or modelshow (case-insensitive). Extract the prompt by removing the trigger keyword.\n\nExample: mdls explain quantum entanglement → prompt = explain quantum entanglement\n\nWorkflow\nStep 1  → Acknowledge & Load Configuration\nStep 2  → Spawn Parallel Model Agents\nStep 3  → Collect Responses with Intelligent Polling\nStep 4  → Anonymize with Cryptographic Randomization\nStep 5  → Spawn Judge+Deanon Sub-Agent\nStep 6  → Parse De-anonymized Results\nStep 7  → Build Formatted Output\nStep 8  → Save Results (optionally update web index via update_modelshow_index.py)\n\nStep 1: Acknowledge & Load Configuration\n\nImmediate Response:\n\n🔄 ModelShow starting — querying models in parallel.\nResults will appear automatically when judging is complete.\n\n\nLoad Configuration: Read {baseDir}/config.json for model list, judge model, timeouts, and other settings.\n\nStep 2: Spawn Parallel Model Agents\n\nFor each model in config.models:\n\nModel: The model alias (e.g., pro, grok, kimi)\nLabel: mdls-{model}-{timestamp} (unique identifier)\nTimeout: config.timeoutSeconds (default: 360 seconds)\nTask:\n{config.systemPrompt}\n\n{extracted user prompt}\n\n\nParallel Execution: If config.parallel is true, spawn all agents simultaneously.\n\nContext Handling: If the prompt references external content (URLs, files, preferences), fetch and prepend this context to the task.\n\nStep 3: Collect Responses with Intelligent Polling\n\nPolling Strategy:\n\nPoll every 20 seconds\nExit immediately when all agents complete\nMinimum 3 polls before considering timeout\nMaximum runtime: config.timeoutSeconds\n\nStatus Updates (content-free):\n\n⏳ Models responding... {done}/{total} complete. ({elapsed}s elapsed)\n✅ All {N} models responded. 
Sending to judge...\n\nResponse Collection:\n\ncollected_responses = {\n  \"model_name\": {\n    \"status\": \"completed\" | \"failed\" | \"timeout\",\n    \"text\": \"response text or empty string\",\n    \"duration_seconds\": duration\n  }\n}\n\n\nMinimum Success Check: If successful responses < config.minSuccessful, abort with informative message.\n\nStep 4: Anonymize with Cryptographic Randomization\n\nExecute the anonymization pipeline:\n\necho '{\n  \"action\": \"anonymize\",\n  \"responses\": {model: response_dict},\n  \"label_style\": \"alphabetic\",\n  \"shuffle\": true\n}' | python3 {baseDir}/judge_pipeline.py\n\n\nKey Features:\n\nshuffle: true ensures cryptographically random response order\nLabels are assigned as \"Response A\", \"Response B\", etc.\nanonymization_map tracks label-to-model mapping for later de-anonymization\nStep 5: Spawn Judge+Deanon Sub-Agent\n\nThe judge sub-agent performs both evaluation and de-anonymization in a single atomic operation:\n\nJudge Task Structure:\n\nYou are an impartial judge AND a data processor.\n\nYour task has TWO parts. Complete BOTH before returning anything.\n\n═══════════════════════════════════════════════════════════\nPART 1: JUDGE THE RESPONSES\n═══════════════════════════════════════════════════════════\n\n[Blind responses with placeholder labels]\n\n═══════════════════════════════════════════════════════════\nPART 2: PROCESS YOUR JUDGMENT\n═══════════════════════════════════════════════════════════\n\n1. Write your judgment evaluating Response A, Response B, etc.\n2. Include scores (1-10) for each response\n3. Provide an \"Overall Assessment\" section analyzing cross-model patterns\n\nAfter writing your judgment, run this command:\n\necho '{\n  \"action\": \"finalize\",\n  \"judge_output\": \"[YOUR JUDGMENT TEXT HERE]\",\n  \"anonymization_map\": {anonymization_map}\n}' | python3 {baseDir}/judge_pipeline.py\n\nReturn ONLY the JSON output from that command.\n\n\nJudge Model: Uses config.judgeModel (e.g. sonnet, gemini31or)\n\nStep 6: Parse De-anonymized Results\n\nThe judge sub-agent returns:\n\ndeanonymized_judge_output: Full judgment with real model names\nranked_models_deanonymized: Structured ranking data\ndeanonymization_complete: Boolean verification\n\nArchitectural Guarantee: The orchestrator never receives placeholder labels—only de-anonymized results.\n\nStep 7: Build Formatted Output\n\nCreate professional presentation:\n\n🕶️ Double-Blind Judging Results:\n\n🏆 Model Name (Score: X.X/10)\n[Full response text]\nJudge's assessment: [Commentary]\n\n🥈 Second Place (Score: X.X/10)\n[Full response text]\nJudge's assessment: [Commentary]\n\n📊 Overall Assessment:\n[Judge's holistic analysis of cross-model patterns]\n\n\n⚠️ WARNING: Step 8 is MANDATORY. Sending results to the user is NOT the end of the task. You MUST execute Step 8 (save_results) immediately after sending. The task is NOT complete until save_results.py returns {\"success\": true}. Do NOT skip, defer, or omit Step 8 under any circumstances.\n\nStep 8: Save Results\n\n🚨 THIS STEP IS REQUIRED. DO NOT SKIP. After every ModelShow run, you MUST save the results using save_results.py. 
Failure to save means the results are not persisted for later use.\n\nSave to config.outputDir (default: ~/.openclaw/workspace/modelshow-results):\n\nJSON: {config.outputDir}/{slug}-{timestamp}.json\nMarkdown: {config.outputDir}/{slug}-{timestamp}.md\n\nExact JSON payload to pipe into save_results.py:\n\n{\n  \"prompt\": \"<the original user prompt>\",\n  \"timestamp\": \"<ISO 8601 timestamp, e.g. 2026-03-08T01:00:00Z>\",\n  \"models\": [\"model1\", \"model2\", \"model3\"],\n  \"judge_model\": \"<config.judgeModel>\",\n  \"output_dir\": \"<config.outputDir>\",\n  \"ranked_results\": [\n    {\n      \"rank\": 1,\n      \"model\": \"model_alias\",\n      \"score\": 9.5,\n      \"judge_notes\": \"Judge's per-model commentary here\",\n      \"response_text\": \"The full model response text here\"\n    },\n    {\n      \"rank\": 2,\n      \"model\": \"model_alias\",\n      \"score\": 8.0,\n      \"judge_notes\": \"Judge's per-model commentary here\",\n      \"response_text\": \"The full model response text here\"\n    }\n  ],\n  \"deanonymized_judge_output\": \"<full judge output text with real model names>\",\n  \"anonymization_map\": {\n    \"Response A\": \"model_alias_1\",\n    \"Response B\": \"model_alias_2\"\n  },\n  \"metadata\": {\n    \"total_duration_ms\": 45000,\n    \"successful_models\": 4,\n    \"failed_models\": 0,\n    \"timed_out_models\": [\"deepseek\"]\n  }\n}\n\n\nExecute the save command:\n\necho '<JSON payload above>' | python3 {baseDir}/save_results.py\n\n\nVerify success: The script MUST return {\"success\": true, ...}. If it returns an error, fix and retry. Do NOT proceed without a successful save.\n\nOptional: For building a local index of result files (e.g. for a custom dashboard or static site) or for web display (e.g. rexuvia.com), see update_modelshow_index.py. 
This is not part of the mandatory workflow.\n\n✅ Only after save_results.py returns success is the ModelShow task complete.\n\nConfiguration (config.json)\nKey\tDescription\tDefault\nkeyword\tPrimary trigger\t\"mdls\"\nalternativeKeywords\tAlso trigger on\t[\"modelshow\"]\nmodels\tList of model aliases to compare\t[\"pro\", \"sonnet\", \"deepseek\", \"gpt4\", \"grok\", \"kimi\"]\njudgeModel\tModel for double-blind evaluation\t\"sonnet\"\noutputDir\tWhere to save result files\t\"~/.openclaw/workspace/modelshow-results\"\ntimeoutSeconds\tMaximum wait time per model\t360\nminSuccessful\tMinimum responses to proceed\t2\nparallel\tRun models in parallel\ttrue\nshowTopN\tNumber of top results to display\t10\nincludeResponseText\tInclude full responses in output\ttrue\nblindJudging\tEnable anonymization\ttrue\nblindJudgingLabels\tLabel style for anonymization\t\"alphabetic\"\nshuffleBlindOrder\tRandomize response order\ttrue\nFile Structure\nmodelshow/\n├── SKILL.md              # This documentation\n├── config.json           # Configuration settings\n├── judge_pipeline.py     # Anonymization & de-anonymization pipeline\n├── save_results.py       # Result saving with holistic assessment extraction\n├── update_modelshow_index.py # Optional: build local index / web index\n├── blind_judge_manager.py # Anonymization utility (legacy)\n├── README.md             # User documentation\n└── .gitignore            # Git exclusions\n\nScripts\njudge_pipeline.py\n\nCore pipeline for anonymization and de-anonymization:\n\naction: \"anonymize\": Creates cryptographically randomized blind responses\naction: \"finalize\": De-anonymizes judge output and extracts rankings\nsave_results.py\n\nSaves results in both JSON and Markdown formats with specialized extraction of the \"Overall Assessment\" section from judge output. Results are written to config.outputDir for local use, scripting, or your own tooling.\n\nupdate_modelshow_index.py\n\nOptional utility to build a local index of result JSON files (e.g. for a custom dashboard or static site) or to update the web index for rexuvia.com. Not required for the core workflow.\n\nUsage Examples\n\nBasic Comparison:\n\nmdls explain the difference between TCP and UDP\n\n\nCreative Task:\n\nmdls write a short poem about working late at night\n\n\nTechnical Analysis:\n\nmdls pros and cons of event sourcing vs traditional CRUD\n\n\nCode Review:\n\nmdls review this Python function for potential issues: [code]\n\nBest Practices\nPrompt Clarity: Provide clear, specific prompts for meaningful comparisons\nModel Selection: Choose models with complementary strengths for the task type\nContext Inclusion: Reference relevant context when appropriate\nResult Interpretation: Consider both scores and the judge's holistic assessment\nTailor config: Update config.json to match the models available on your instance\nWeb Integration: Optionally use update_modelshow_index.py to publish results\nIntegration Points\nLocal storage: Results are saved as JSON and Markdown in config.outputDir for local use, scripting, or your own tooling\nWeb display: Use update_modelshow_index.py to make results available online\nCron Automation: Can be scheduled for regular comparative analysis\nAPI Access: JSON results enable programmatic analysis\n\nModelShow represents state-of-the-art in AI model comparison, combining rigorous methodology with practical usability for both casual exploration and professional evaluation."
  },
  "trust": {
    "sourceLabel": "tencent",
    "provenanceUrl": "https://clawhub.ai/schbz/modelshow",
    "publisherUrl": "https://clawhub.ai/schbz/modelshow",
    "owner": "schbz",
    "version": "1.0.1",
    "license": null,
    "verificationStatus": "Indexed source record"
  },
  "links": {
    "detailUrl": "https://openagent3.xyz/skills/modelshow",
    "downloadUrl": "https://openagent3.xyz/downloads/modelshow",
    "agentUrl": "https://openagent3.xyz/skills/modelshow/agent",
    "manifestUrl": "https://openagent3.xyz/skills/modelshow/agent.json",
    "briefUrl": "https://openagent3.xyz/skills/modelshow/agent.md"
  }
}