{
  "schemaVersion": "1.0",
  "item": {
    "slug": "prompt-performance-tester",
    "name": "Prompt Performance Tester - UnisAI",
    "source": "tencent",
    "type": "skill",
    "category": "开发工具",
    "sourceUrl": "https://clawhub.ai/vedantsingh60/prompt-performance-tester",
    "canonicalUrl": "https://clawhub.ai/vedantsingh60/prompt-performance-tester",
    "targetPlatform": "OpenClaw"
  },
  "install": {
    "downloadMode": "redirect",
    "downloadUrl": "/downloads/prompt-performance-tester",
    "sourceDownloadUrl": "https://wry-manatee-359.convex.site/api/v1/download?slug=prompt-performance-tester",
    "sourcePlatform": "tencent",
    "targetPlatform": "OpenClaw",
    "installMethod": "Manual import",
    "extraction": "Extract archive",
    "prerequisites": [
      "OpenClaw"
    ],
    "packageFormat": "ZIP package",
    "includedAssets": [
      "LICENSE.md",
      "manifest.yaml",
      "SKILL.md",
      "prompt_performance_tester.py"
    ],
    "primaryDoc": "SKILL.md",
    "quickSetup": [
      "Download the package from Yavira.",
      "Extract the archive and review SKILL.md first.",
      "Import or place the package into your OpenClaw setup."
    ],
    "agentAssist": {
      "summary": "Hand the extracted package to your coding agent with a concrete install brief instead of figuring it out manually.",
      "steps": [
        "Download the package from Yavira.",
        "Extract it into a folder your agent can access.",
        "Paste one of the prompts below and point your agent at the extracted folder."
      ],
      "prompts": [
        {
          "label": "New install",
          "body": "I downloaded a skill package from Yavira. Read SKILL.md from the extracted folder and install it by following the included instructions. Tell me what you changed and call out any manual steps you could not complete."
        },
        {
          "label": "Upgrade existing",
          "body": "I downloaded an updated skill package from Yavira. Read SKILL.md from the extracted folder, compare it with my current installation, and upgrade it while preserving any custom configuration unless the package docs explicitly say otherwise. Summarize what changed and any follow-up checks I should run."
        }
      ]
    },
    "sourceHealth": {
      "source": "tencent",
      "status": "healthy",
      "reason": "direct_download_ok",
      "recommendedAction": "download",
      "checkedAt": "2026-04-30T16:55:25.780Z",
      "expiresAt": "2026-05-07T16:55:25.780Z",
      "httpStatus": 200,
      "finalUrl": "https://wry-manatee-359.convex.site/api/v1/download?slug=network",
      "contentType": "application/zip",
      "probeMethod": "head",
      "details": {
        "probeUrl": "https://wry-manatee-359.convex.site/api/v1/download?slug=network",
        "contentDisposition": "attachment; filename=\"network-1.0.0.zip\"",
        "redirectLocation": null,
        "bodySnippet": null
      },
      "scope": "source",
      "summary": "Source download looks usable.",
      "detail": "Yavira can redirect you to the upstream package for this source.",
      "primaryActionLabel": "Download for OpenClaw",
      "primaryActionHref": "/downloads/prompt-performance-tester"
    },
    "validation": {
      "installChecklist": [
        "Use the Yavira download entry.",
        "Review SKILL.md after the package is downloaded.",
        "Confirm the extracted package contains the expected setup assets."
      ],
      "postInstallChecks": [
        "Confirm the extracted package includes the expected docs or setup files.",
        "Validate the skill or prompts are available in your target agent workspace.",
        "Capture any manual follow-up steps the agent could not complete."
      ]
    },
    "downloadPageUrl": "https://openagent3.xyz/downloads/prompt-performance-tester",
    "agentPageUrl": "https://openagent3.xyz/skills/prompt-performance-tester/agent",
    "manifestUrl": "https://openagent3.xyz/skills/prompt-performance-tester/agent.json",
    "briefUrl": "https://openagent3.xyz/skills/prompt-performance-tester/agent.md"
  },
  "agentAssist": {
    "summary": "Hand the extracted package to your coding agent with a concrete install brief instead of figuring it out manually.",
    "steps": [
      "Download the package from Yavira.",
      "Extract it into a folder your agent can access.",
      "Paste one of the prompts below and point your agent at the extracted folder."
    ],
    "prompts": [
      {
        "label": "New install",
        "body": "I downloaded a skill package from Yavira. Read SKILL.md from the extracted folder and install it by following the included instructions. Tell me what you changed and call out any manual steps you could not complete."
      },
      {
        "label": "Upgrade existing",
        "body": "I downloaded an updated skill package from Yavira. Read SKILL.md from the extracted folder, compare it with my current installation, and upgrade it while preserving any custom configuration unless the package docs explicitly say otherwise. Summarize what changed and any follow-up checks I should run."
      }
    ]
  },
  "documentation": {
    "source": "clawhub",
    "primaryDoc": "SKILL.md",
    "sections": [
      {
        "title": "Prompt Performance Tester",
        "body": "Model-agnostic prompt benchmarking across 9 providers.\n\nPass any model ID — provider auto-detected. Compare latency, cost, quality, and consistency across Claude, GPT, Gemini, DeepSeek, Grok, MiniMax, Qwen, Llama, and Mistral."
      },
      {
        "title": "Problem Statement",
        "body": "Comparing LLM models across providers requires manual testing:\n\nNo systematic way to measure performance across models\nCost differences are significant but not easily comparable\nQuality varies by use case and provider\nManual API testing is time-consuming and error-prone"
      },
      {
        "title": "The Solution",
        "body": "Test prompts across any model from any supported provider simultaneously. Get performance metrics and recommendations based on latency, cost, and quality."
      },
      {
        "title": "Example Cost Comparison",
        "body": "For 10,000 requests/day with average 28 input + 115 output tokens:\n\nClaude Opus 4.6: ~$30.15/day ($903/month)\nGemini 2.5 Flash-Lite: ~$0.05/day ($1.50/month)\nDeepSeek Chat: ~$0.14/day ($4.20/month)\nMonthly cost difference (Opus vs Flash-Lite): $901.50"
      },
      {
        "title": "Model-Agnostic Multi-Provider Testing",
        "body": "Pass any model ID — provider is auto-detected from the model name prefix.\nNo hardcoded list; new models work without code changes.\n\nProviderExample ModelsPrefixRequired KeyAnthropicclaude-opus-4-6, claude-sonnet-4-6, claude-haiku-4-5-20251001claude-ANTHROPIC_API_KEYOpenAIgpt-5.2-pro, gpt-5.2, gpt-5.1gpt-, o1, o3OPENAI_API_KEYGooglegemini-2.5-pro, gemini-2.5-flash, gemini-2.5-flash-litegemini-GOOGLE_API_KEYMistralmistral-large-latest, mistral-small-latestmistral-, mixtral-MISTRAL_API_KEYDeepSeekdeepseek-chat, deepseek-reasonerdeepseek-DEEPSEEK_API_KEYxAIgrok-4-1-fast, grok-3-betagrok-XAI_API_KEYMiniMaxMiniMax-M2.1MiniMax, minimaxMINIMAX_API_KEYQwenqwen3.5-plus, qwen3-max-instructqwenDASHSCOPE_API_KEYMeta Llamameta-llama/llama-4-maverick, meta-llama/llama-3.3-70b-instructmeta-llama/, llama-OPENROUTER_API_KEY"
      },
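      {
        "title": "Illustrative Sketch: Prefix-Based Provider Detection",
        "body": "A minimal sketch of how the prefix-based routing described above can work. The PROVIDER_MAP entries mirror the table, but the function name detect_provider and the exact data layout are illustrative assumptions rather than the shipped implementation in prompt_performance_tester.py.\n\nPROVIDER_MAP = [\n    # (model-ID prefix, provider, required API key env var)\n    (\"claude-\", \"anthropic\", \"ANTHROPIC_API_KEY\"),\n    (\"gpt-\", \"openai\", \"OPENAI_API_KEY\"),\n    (\"o1\", \"openai\", \"OPENAI_API_KEY\"),\n    (\"o3\", \"openai\", \"OPENAI_API_KEY\"),\n    (\"gemini-\", \"google\", \"GOOGLE_API_KEY\"),\n    (\"mistral-\", \"mistral\", \"MISTRAL_API_KEY\"),\n    (\"mixtral-\", \"mistral\", \"MISTRAL_API_KEY\"),\n    (\"deepseek-\", \"deepseek\", \"DEEPSEEK_API_KEY\"),\n    (\"grok-\", \"xai\", \"XAI_API_KEY\"),\n    (\"minimax\", \"minimax\", \"MINIMAX_API_KEY\"),\n    (\"qwen\", \"qwen\", \"DASHSCOPE_API_KEY\"),\n    (\"meta-llama/\", \"openrouter\", \"OPENROUTER_API_KEY\"),\n    (\"llama-\", \"openrouter\", \"OPENROUTER_API_KEY\"),\n]\n\ndef detect_provider(model_id: str):\n    \"\"\"Return (provider, required_env_var) for the first matching prefix.\"\"\"\n    name = model_id.lower()\n    for prefix, provider, env_var in PROVIDER_MAP:\n        if name.startswith(prefix.lower()):\n            return provider, env_var\n    raise ValueError(f\"No provider prefix matches model ID: {model_id}\")\n\nprint(detect_provider(\"claude-opus-4-6\"))    # ('anthropic', 'ANTHROPIC_API_KEY')\nprint(detect_provider(\"deepseek-reasoner\"))  # ('deepseek', 'DEEPSEEK_API_KEY')\n\nAny new model whose ID starts with one of these prefixes routes automatically, which is the behavior the table above describes."
      },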
      {
        "title": "Known Pricing (per 1M tokens)",
        "body": "ModelInputOutputclaude-opus-4-6$15.00$75.00claude-sonnet-4-6$3.00$15.00claude-haiku-4-5-20251001$1.00$5.00gpt-5.2-pro$21.00$168.00gpt-5.2$1.75$14.00gpt-5.1$2.00$8.00gemini-2.5-pro$1.25$10.00gemini-2.5-flash$0.30$2.50gemini-2.5-flash-lite$0.10$0.40mistral-large-latest$2.00$6.00mistral-small-latest$0.10$0.30deepseek-chat$0.27$1.10deepseek-reasoner$0.55$2.19grok-4-1-fast$5.00$25.00grok-3-beta$3.00$15.00MiniMax-M2.1$0.40$1.60qwen3.5-plus$0.57$2.29qwen3-max-instruct$1.60$6.40meta-llama/llama-4-maverick$0.20$0.60meta-llama/llama-3.3-70b-instruct$0.59$0.79\n\nNote: Unlisted models still work — cost calculation returns $0.00 with a warning. Pricing table is for reference only, not a validation gate."
      },
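      {
        "title": "Illustrative Sketch: Cost Calculation from the Pricing Table",
        "body": "A short sketch of per-request cost arithmetic, assuming prices are stored per 1M tokens as in the table above. The PRICING dict is trimmed to a few rows and the function name request_cost is an assumption for illustration; unlisted models fall back to $0.00 with a warning, matching the note above.\n\nimport warnings\n\n# Per-1M-token prices in USD (input, output); a few rows from the table above.\nPRICING = {\n    \"claude-haiku-4-5-20251001\": (1.00, 5.00),\n    \"gpt-5.2\": (1.75, 14.00),\n    \"gemini-2.5-flash-lite\": (0.10, 0.40),\n    \"deepseek-chat\": (0.27, 1.10),\n}\n\ndef request_cost(model: str, input_tokens: int, output_tokens: int) -> float:\n    \"\"\"USD cost of one request; unknown models return 0.0 with a warning.\"\"\"\n    if model not in PRICING:\n        warnings.warn(f\"No pricing known for {model}; reporting $0.00\")\n        return 0.0\n    in_price, out_price = PRICING[model]\n    return (input_tokens * in_price + output_tokens * out_price) / 1_000_000\n\nprint(request_cost(\"gpt-5.2\", 1200, 350))           # priced from the table\nprint(request_cost(\"some-unlisted-model\", 50, 50))  # warns and returns 0.0"
      },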
      {
        "title": "Performance Metrics",
        "body": "Every test measures:\n\n⚡ Latency — Response time in milliseconds\n💰 Cost — Exact API cost per request (input + output tokens)\n🎯 Quality — Response quality score (0–100)\n📊 Token Usage — Input and output token counts\n🔄 Consistency — Variance across multiple test runs\n❌ Error Tracking — API failures, timeouts, rate limits"
      },
      {
        "title": "Smart Recommendations",
        "body": "Get instant answers to:\n\nWhich model is fastest for your prompt?\nWhich is most cost-effective?\nWhich produces best quality responses?\nHow much can you save by switching providers?"
      },
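      {
        "title": "Illustrative Sketch: Deriving Recommendations",
        "body": "A minimal sketch of turning per-model metrics into the fastest, cheapest, and best-quality picks. The result dicts and field names are illustrative assumptions (the figures mirror the real-world example below); the skill's own recommendation engine may weigh additional factors.\n\n# Per-model summary metrics, e.g. averaged over num_runs (illustrative values).\nresults = [\n    {\"model\": \"gemini-2.5-flash-lite\", \"latency_ms\": 523, \"cost_usd\": 0.000025, \"quality\": 65},\n    {\"model\": \"deepseek-chat\", \"latency_ms\": 710, \"cost_usd\": 0.000048, \"quality\": 70},\n    {\"model\": \"claude-haiku-4-5-20251001\", \"latency_ms\": 891, \"cost_usd\": 0.000145, \"quality\": 78},\n    {\"model\": \"gpt-5.2\", \"latency_ms\": 645, \"cost_usd\": 0.000402, \"quality\": 88},\n]\n\nfastest = min(results, key=lambda r: r[\"latency_ms\"])\ncheapest = min(results, key=lambda r: r[\"cost_usd\"])\nbest_quality = max(results, key=lambda r: r[\"quality\"])\n\nprint(f\"Fastest:      {fastest['model']} ({fastest['latency_ms']} ms)\")\nprint(f\"Cheapest:     {cheapest['model']} (${cheapest['cost_usd']:.6f}/request)\")\nprint(f\"Best quality: {best_quality['model']} ({best_quality['quality']}/100)\")"
      },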
      {
        "title": "📊 Real-World Example",
        "body": "PROMPT: \"Write a professional customer service response about a delayed shipment\"\n\n┌─────────────────────────────────────────────────────────────────┐\n│ GEMINI 2.5 FLASH-LITE (Google) 💰 MOST AFFORDABLE              │\n├─────────────────────────────────────────────────────────────────┤\n│ Latency:  523ms                                                 │\n│ Cost:     $0.000025                                             │\n│ Quality:  65/100                                                │\n│ Tokens:   28 in / 87 out                                        │\n└─────────────────────────────────────────────────────────────────┘\n\n┌─────────────────────────────────────────────────────────────────┐\n│ DEEPSEEK CHAT (DeepSeek) 💡 BUDGET PICK                        │\n├─────────────────────────────────────────────────────────────────┤\n│ Latency:  710ms                                                 │\n│ Cost:     $0.000048                                             │\n│ Quality:  70/100                                                │\n│ Tokens:   28 in / 92 out                                        │\n└─────────────────────────────────────────────────────────────────┘\n\n┌─────────────────────────────────────────────────────────────────┐\n│ CLAUDE HAIKU 4.5 (Anthropic) 🚀 BALANCED PERFORMER             │\n├─────────────────────────────────────────────────────────────────┤\n│ Latency:  891ms                                                 │\n│ Cost:     $0.000145                                             │\n│ Quality:  78/100                                                │\n│ Tokens:   28 in / 102 out                                       │\n└─────────────────────────────────────────────────────────────────┘\n\n┌─────────────────────────────────────────────────────────────────┐\n│ GPT-5.2 (OpenAI) 💡 EXCELLENT QUALITY                          │\n├─────────────────────────────────────────────────────────────────┤\n│ Latency:  645ms                                                 │\n│ Cost:     $0.000402                                             │\n│ Quality:  88/100                                                │\n│ Tokens:   28 in / 98 out                                        │\n└─────────────────────────────────────────────────────────────────┘\n\n┌─────────────────────────────────────────────────────────────────┐\n│ CLAUDE OPUS 4.6 (Anthropic) 🏆 HIGHEST QUALITY                 │\n├─────────────────────────────────────────────────────────────────┤\n│ Latency:  1,234ms                                               │\n│ Cost:     $0.001875                                             │\n│ Quality:  94/100                                                │\n│ Tokens:   28 in / 125 out                                       │\n└─────────────────────────────────────────────────────────────────┘\n\n🎯 RECOMMENDATIONS:\n1. Most cost-effective: Gemini 2.5 Flash-Lite ($0.000025/request) — 99.98% cheaper than Opus\n2. Budget pick: DeepSeek Chat ($0.000048/request) — strong quality at low cost\n3. Best quality: Claude Opus 4.6 (94/100) — state-of-the-art reasoning & analysis\n4. Smart pick: Claude Haiku 4.5 ($0.000145/request) — 81% cheaper, 83% quality match\n5. 
Speed + Quality: GPT-5.2 ($0.000402/request) — excellent quality at mid-range cost\n\n💡 Potential monthly savings (10,000 requests/day, 28 input + 115 output tokens avg):\n   - Using Gemini 2.5 Flash-Lite vs Opus: $903/month saved ($1.44 vs $904.50)\n   - Using DeepSeek Chat vs Opus: $899/month saved ($4.50 vs $904.50)\n   - Using Claude Haiku vs Opus: $731/month saved ($173.40 vs $904.50)"
      },
      {
        "title": "Production Deployment",
        "body": "Evaluate models before production selection\nCompare cost vs quality tradeoffs\nBenchmark API latency across providers"
      },
      {
        "title": "Prompt Development",
        "body": "Test prompt variations across models\nMeasure quality scores consistently\nCompare performance metrics"
      },
      {
        "title": "Cost Analysis",
        "body": "Analyze LLM API spending by model\nCompare provider pricing structures\nIdentify cost-efficient alternatives"
      },
      {
        "title": "Performance Testing",
        "body": "Measure latency and response times\nTest consistency across multiple runs\nEvaluate quality scores"
      },
      {
        "title": "1. Subscribe to Skill",
        "body": "Click \"Subscribe\" on ClawhHub to get access."
      },
      {
        "title": "2. Set API Keys",
        "body": "Add keys for the providers you want to test:\n\n# Anthropic (Claude models)\nexport ANTHROPIC_API_KEY=\"sk-ant-...\"\n\n# OpenAI (GPT models)\nexport OPENAI_API_KEY=\"sk-...\"\n\n# Google (Gemini models)\nexport GOOGLE_API_KEY=\"AI...\"\n\n# DeepSeek\nexport DEEPSEEK_API_KEY=\"...\"\n\n# xAI (Grok models)\nexport XAI_API_KEY=\"...\"\n\n# MiniMax\nexport MINIMAX_API_KEY=\"...\"\n\n# Alibaba (Qwen models)\nexport DASHSCOPE_API_KEY=\"...\"\n\n# OpenRouter (Meta Llama models)\nexport OPENROUTER_API_KEY=\"...\"\n\n# Mistral\nexport MISTRAL_API_KEY=\"...\"\n\nYou only need keys for the providers you plan to test."
      },
      {
        "title": "3. Install Dependencies",
        "body": "# Install only what you need\npip install anthropic          # Claude\npip install openai             # GPT, DeepSeek, xAI, MiniMax, Qwen, Llama\npip install google-generativeai  # Gemini\npip install mistralai          # Mistral\n\n# Or install everything\npip install anthropic openai google-generativeai mistralai"
      },
      {
        "title": "4. Run Your First Test",
        "body": "Option A: Python\n\nimport os\nfrom prompt_performance_tester import PromptPerformanceTester\n\ntester = PromptPerformanceTester()  # reads API keys from environment\n\nresults = tester.test_prompt(\n    prompt_text=\"Write a professional email apologizing for a delayed shipment\",\n    models=[\n        \"claude-haiku-4-5-20251001\",\n        \"gpt-5.2\",\n        \"gemini-2.5-flash\",\n        \"deepseek-chat\",\n    ],\n    num_runs=3,\n    max_tokens=500\n)\n\nprint(tester.format_results(results))\nprint(f\"🏆 Best quality:  {results.best_model}\")\nprint(f\"💰 Cheapest:      {results.cheapest_model}\")\nprint(f\"⚡ Fastest:       {results.fastest_model}\")\n\nOption B: CLI\n\n# Test across multiple models\nprompt-tester test \"Your prompt here\" \\\n  --models claude-haiku-4-5-20251001 gpt-5.2 gemini-2.5-flash deepseek-chat \\\n  --runs 3\n\n# Export results\nprompt-tester test \"Your prompt here\" --export results.json"
      },
      {
        "title": "API Key Safety",
        "body": "Keys stored in environment variables only — never hardcoded or logged\nNever transmitted to UnisAI servers\nHTTPS encryption for all provider API calls"
      },
      {
        "title": "Data Privacy",
        "body": "Your prompts are sent only to the AI providers you select for testing\nEach provider has their own data retention policy (see their privacy pages)\nNo data stored on UnisAI infrastructure"
      },
      {
        "title": "System Requirements",
        "body": "Python: 3.9+\nDependencies: anthropic, openai, google-generativeai, mistralai (install only what you need)\nPlatform: macOS, Linux, Windows"
      },
      {
        "title": "Architecture",
        "body": "Lazy client initialization — SDK clients only loaded for providers actually tested\nPrefix-based routing — PROVIDER_MAP detects provider from model name; no hardcoded whitelist\nOpenAI-compat path — DeepSeek, xAI, MiniMax, Qwen, and OpenRouter all use the openai SDK with a custom base_url\nPricing table — used for cost calculation only; unknown models get cost=0 with a warning"
      },
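      {
        "title": "Illustrative Sketch: Lazy Clients and the OpenAI-Compat Path",
        "body": "A sketch of the two points above: SDK clients are created only when a provider is first used, and OpenAI-compatible providers reuse the openai SDK with a custom base_url. The endpoint URLs, cache layout, and function name get_client are illustrative assumptions; check each provider's documentation for exact base URLs.\n\nimport os\nfrom functools import lru_cache\n\n# OpenAI-compatible providers routed through the openai SDK with a custom base_url.\nOPENAI_COMPAT = {\n    \"deepseek\": (\"https://api.deepseek.com\", \"DEEPSEEK_API_KEY\"),\n    \"xai\": (\"https://api.x.ai/v1\", \"XAI_API_KEY\"),\n    \"openrouter\": (\"https://openrouter.ai/api/v1\", \"OPENROUTER_API_KEY\"),\n}\n\n@lru_cache(maxsize=None)\ndef get_client(provider: str):\n    \"\"\"Create an SDK client the first time a provider is used, then reuse it.\"\"\"\n    if provider == \"anthropic\":\n        import anthropic  # imported lazily so unused SDKs are never loaded\n        return anthropic.Anthropic(api_key=os.environ[\"ANTHROPIC_API_KEY\"])\n    if provider == \"openai\":\n        from openai import OpenAI\n        return OpenAI(api_key=os.environ[\"OPENAI_API_KEY\"])\n    if provider in OPENAI_COMPAT:\n        from openai import OpenAI\n        base_url, env_var = OPENAI_COMPAT[provider]\n        return OpenAI(base_url=base_url, api_key=os.environ[env_var])\n    raise ValueError(f\"Unsupported provider: {provider}\")"
      },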
      {
        "title": "Metrics Collected",
        "body": "Every test captures:\n\nLatency: Total response time (ms)\nCost: Input + output cost based on known pricing (USD)\nQuality: Heuristic response score based on length, completeness (0–100)\nTokens: Exact input/output token counts per provider\nConsistency: Standard deviation across multiple runs\nErrors: Timeouts, rate limits, API failures"
      },
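      {
        "title": "Illustrative Sketch: Consistency Across Runs",
        "body": "A small sketch of the consistency metric: the standard deviation of per-run measurements, which is what the skill reports when num_runs > 1. The run values here are invented for illustration.\n\nfrom statistics import mean, stdev\n\n# Latency (ms) from three runs of the same prompt on one model (illustrative values).\nlatencies_ms = [645, 702, 661]\n\navg = mean(latencies_ms)\nconsistency = stdev(latencies_ms)  # lower means more consistent\n\nprint(f\"Mean latency: {avg:.0f} ms, std dev: {consistency:.1f} ms\")"
      },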
      {
        "title": "❓ Frequently Asked Questions",
        "body": "Q: Do I need API keys for all 9 providers?\nA: No. You only need keys for the providers you want to test. If you only test Claude models, you only need ANTHROPIC_API_KEY.\n\nQ: Who pays for the API costs?\nA: You do. You provide your own API keys and pay each provider directly. This skill has no per-request fees.\n\nQ: How accurate are the cost calculations?\nA: Costs are calculated from the known pricing table using actual token counts. Models not in the pricing table return $0.00 — the model still runs, the cost just won't be shown.\n\nQ: Can I test models not in the pricing table?\nA: Yes. Any model whose name starts with a supported prefix will run. Cost will show as $0.00 for unlisted models.\n\nQ: Can I test prompts in non-English languages?\nA: Yes. All supported providers handle multiple languages.\n\nQ: Can I use this in production/CI/CD?\nA: Yes. Import PromptPerformanceTester directly from Python or call via CLI.\n\nQ: What if my prompt is very long?\nA: Set max_tokens appropriately. The skill passes your prompt as-is to each provider's API."
      },
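      {
        "title": "Illustrative Sketch: Using the Tester in CI",
        "body": "A minimal sketch of the production/CI use the FAQ mentions: run a benchmark in a pipeline step and fail the job if the deployed model is no longer the quality leader. The gating rule and the DEPLOYED_MODEL value are assumptions for illustration; test_prompt, format_results, and the best_model field come from the Quick Start example above.\n\nimport sys\nfrom prompt_performance_tester import PromptPerformanceTester\n\nDEPLOYED_MODEL = \"claude-haiku-4-5-20251001\"  # model currently used in production (assumed)\n\ntester = PromptPerformanceTester()  # reads API keys from environment\nresults = tester.test_prompt(\n    prompt_text=\"Write a professional email apologizing for a delayed shipment\",\n    models=[DEPLOYED_MODEL, \"gpt-5.2\", \"gemini-2.5-flash\", \"deepseek-chat\"],\n    num_runs=3,\n    max_tokens=500,\n)\n\nprint(tester.format_results(results))\n\n# Fail the pipeline if another model now beats the deployed one on quality.\nif results.best_model != DEPLOYED_MODEL:\n    print(f\"Quality leader is now {results.best_model}; review before the next deploy.\")\n    sys.exit(1)"
      },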
      {
        "title": "✅ Current Release (v1.1.8)",
        "body": "Model-agnostic architecture — any model ID works via prefix detection\n9 providers, 20 known models with pricing\nDeepSeek, xAI Grok, MiniMax, Qwen, Meta Llama as first-class providers\nClaude 4.6 series (opus-4-6, sonnet-4-6)\nLazy client initialization — only loads SDKs for providers actually used\nFixed UnisAI branding throughout"
      },
      {
        "title": "🚧 Coming Soon (v1.3)",
        "body": "Batch testing: Test 100+ prompts simultaneously\nHistorical tracking: Track model performance over time\nWebhook integrations: Slack, Discord, email notifications"
      },
      {
        "title": "🔮 Future (v1.3+)",
        "body": "A/B testing framework: Scientific prompt experimentation\nFine-tuning insights: Which models to fine-tune for your use case\nCustom benchmarks: Create your own evaluation criteria\nAuto-optimization: AI-powered prompt improvement suggestions"
      },
      {
        "title": "📞 Support",
        "body": "Email: support@unisai.vercel.app\nWebsite: https://unisai.vercel.app\nBug Reports: support@unisai.vercel.app"
      },
      {
        "title": "📄 License & Terms",
        "body": "This skill is distributed via ClawhHub under the following terms."
      },
      {
        "title": "✅ You CAN:",
        "body": "Use for your own business and projects\nTest prompts for internal applications\nModify source code for personal use"
      },
      {
        "title": "❌ You CANNOT:",
        "body": "Redistribute outside ClawhHub registry\nResell or sublicense\nUse UnisAI trademark without permission\n\nFull Terms: See LICENSE.md"
      },
      {
        "title": "[1.1.8] - 2026-02-27",
        "body": "Fixes & Polish\n\nBumped version to 1.1.8\nSKILL.md fully rewritten — cleaned up formatting, removed stale content\nRemoved old IP watermark reference (PROPRIETARY_SKILL_VEDANT_2024) from docs\nCorrected watermark to PROPRIETARY_SKILL_UNISAI_2026_MULTI_PROVIDER throughout\nFixed all UnisAI branding (was UniAI in v1.1.0 changelog)\nUpdated pricing table to include all 20 known models\nCleaned up FAQ, Quick Start, and Use Cases sections"
      },
      {
        "title": "[1.1.6] - 2026-02-27",
        "body": "🏗️ Model-Agnostic Architecture\n\nProvider auto-detected from model name prefix — no hardcoded whitelist\nAny new model works automatically without code changes\nAdded DeepSeek, xAI Grok, MiniMax, Qwen, Meta Llama as first-class providers (9 total)\nUpdated Claude to 4.6 series (claude-opus-4-6, claude-sonnet-4-6)\nLazy client initialization — only loads SDKs for providers actually tested\nUnified OpenAI-compat path for DeepSeek, xAI, MiniMax, Qwen, OpenRouter"
      },
      {
        "title": "[1.1.5] - 2026-02-01",
        "body": "🚀 Latest Models Update\n\nGPT-5.2 Series — Added Instant, Thinking, and Pro variants\nGemini 2.5 Series — Updated to 2.5 Pro, Flash, and Flash-Lite\nClaude 4.5 pricing updates\n10 total models across 3 providers"
      },
      {
        "title": "[1.1.0] - 2026-01-15",
        "body": "✨ Major Features\n\nMulti-provider support — Claude, GPT, Gemini\nCross-provider cost comparison\nEnhanced recommendations engine\nRebranded to UnisAI"
      },
      {
        "title": "[1.0.0] - 2024-02-02",
        "body": "Initial Release\n\nClaude-only prompt testing (Haiku, Sonnet, Opus)\nPerformance metrics: latency, cost, quality, consistency\nBasic recommendations engine\n\nLast Updated: February 27, 2026\nCurrent Version: 1.1.8\nStatus: Active & Maintained\n\n© 2026 UnisAI. All rights reserved."
      }
    ],
    "body": "Prompt Performance Tester\n\nModel-agnostic prompt benchmarking across 9 providers.\n\nPass any model ID — provider auto-detected. Compare latency, cost, quality, and consistency across Claude, GPT, Gemini, DeepSeek, Grok, MiniMax, Qwen, Llama, and Mistral.\n\n🚀 Why This Skill?\nProblem Statement\n\nComparing LLM models across providers requires manual testing:\n\nNo systematic way to measure performance across models\nCost differences are significant but not easily comparable\nQuality varies by use case and provider\nManual API testing is time-consuming and error-prone\nThe Solution\n\nTest prompts across any model from any supported provider simultaneously. Get performance metrics and recommendations based on latency, cost, and quality.\n\nExample Cost Comparison\n\nFor 10,000 requests/day with average 28 input + 115 output tokens:\n\nClaude Opus 4.6: ~$30.15/day ($903/month)\nGemini 2.5 Flash-Lite: ~$0.05/day ($1.50/month)\nDeepSeek Chat: ~$0.14/day ($4.20/month)\nMonthly cost difference (Opus vs Flash-Lite): $901.50\n✨ What You Get\nModel-Agnostic Multi-Provider Testing\n\nPass any model ID — provider is auto-detected from the model name prefix. No hardcoded list; new models work without code changes.\n\nProvider\tExample Models\tPrefix\tRequired Key\nAnthropic\tclaude-opus-4-6, claude-sonnet-4-6, claude-haiku-4-5-20251001\tclaude-\tANTHROPIC_API_KEY\nOpenAI\tgpt-5.2-pro, gpt-5.2, gpt-5.1\tgpt-, o1, o3\tOPENAI_API_KEY\nGoogle\tgemini-2.5-pro, gemini-2.5-flash, gemini-2.5-flash-lite\tgemini-\tGOOGLE_API_KEY\nMistral\tmistral-large-latest, mistral-small-latest\tmistral-, mixtral-\tMISTRAL_API_KEY\nDeepSeek\tdeepseek-chat, deepseek-reasoner\tdeepseek-\tDEEPSEEK_API_KEY\nxAI\tgrok-4-1-fast, grok-3-beta\tgrok-\tXAI_API_KEY\nMiniMax\tMiniMax-M2.1\tMiniMax, minimax\tMINIMAX_API_KEY\nQwen\tqwen3.5-plus, qwen3-max-instruct\tqwen\tDASHSCOPE_API_KEY\nMeta Llama\tmeta-llama/llama-4-maverick, meta-llama/llama-3.3-70b-instruct\tmeta-llama/, llama-\tOPENROUTER_API_KEY\nKnown Pricing (per 1M tokens)\nModel\tInput\tOutput\nclaude-opus-4-6\t$15.00\t$75.00\nclaude-sonnet-4-6\t$3.00\t$15.00\nclaude-haiku-4-5-20251001\t$1.00\t$5.00\ngpt-5.2-pro\t$21.00\t$168.00\ngpt-5.2\t$1.75\t$14.00\ngpt-5.1\t$2.00\t$8.00\ngemini-2.5-pro\t$1.25\t$10.00\ngemini-2.5-flash\t$0.30\t$2.50\ngemini-2.5-flash-lite\t$0.10\t$0.40\nmistral-large-latest\t$2.00\t$6.00\nmistral-small-latest\t$0.10\t$0.30\ndeepseek-chat\t$0.27\t$1.10\ndeepseek-reasoner\t$0.55\t$2.19\ngrok-4-1-fast\t$5.00\t$25.00\ngrok-3-beta\t$3.00\t$15.00\nMiniMax-M2.1\t$0.40\t$1.60\nqwen3.5-plus\t$0.57\t$2.29\nqwen3-max-instruct\t$1.60\t$6.40\nmeta-llama/llama-4-maverick\t$0.20\t$0.60\nmeta-llama/llama-3.3-70b-instruct\t$0.59\t$0.79\n\nNote: Unlisted models still work — cost calculation returns $0.00 with a warning. 
Pricing table is for reference only, not a validation gate.\n\nPerformance Metrics\n\nEvery test measures:\n\n⚡ Latency — Response time in milliseconds\n💰 Cost — Exact API cost per request (input + output tokens)\n🎯 Quality — Response quality score (0–100)\n📊 Token Usage — Input and output token counts\n🔄 Consistency — Variance across multiple test runs\n❌ Error Tracking — API failures, timeouts, rate limits\nSmart Recommendations\n\nGet instant answers to:\n\nWhich model is fastest for your prompt?\nWhich is most cost-effective?\nWhich produces best quality responses?\nHow much can you save by switching providers?\n📊 Real-World Example\nPROMPT: \"Write a professional customer service response about a delayed shipment\"\n\n┌─────────────────────────────────────────────────────────────────┐\n│ GEMINI 2.5 FLASH-LITE (Google) 💰 MOST AFFORDABLE              │\n├─────────────────────────────────────────────────────────────────┤\n│ Latency:  523ms                                                 │\n│ Cost:     $0.000025                                             │\n│ Quality:  65/100                                                │\n│ Tokens:   28 in / 87 out                                        │\n└─────────────────────────────────────────────────────────────────┘\n\n┌─────────────────────────────────────────────────────────────────┐\n│ DEEPSEEK CHAT (DeepSeek) 💡 BUDGET PICK                        │\n├─────────────────────────────────────────────────────────────────┤\n│ Latency:  710ms                                                 │\n│ Cost:     $0.000048                                             │\n│ Quality:  70/100                                                │\n│ Tokens:   28 in / 92 out                                        │\n└─────────────────────────────────────────────────────────────────┘\n\n┌─────────────────────────────────────────────────────────────────┐\n│ CLAUDE HAIKU 4.5 (Anthropic) 🚀 BALANCED PERFORMER             │\n├─────────────────────────────────────────────────────────────────┤\n│ Latency:  891ms                                                 │\n│ Cost:     $0.000145                                             │\n│ Quality:  78/100                                                │\n│ Tokens:   28 in / 102 out                                       │\n└─────────────────────────────────────────────────────────────────┘\n\n┌─────────────────────────────────────────────────────────────────┐\n│ GPT-5.2 (OpenAI) 💡 EXCELLENT QUALITY                          │\n├─────────────────────────────────────────────────────────────────┤\n│ Latency:  645ms                                                 │\n│ Cost:     $0.000402                                             │\n│ Quality:  88/100                                                │\n│ Tokens:   28 in / 98 out                                        │\n└─────────────────────────────────────────────────────────────────┘\n\n┌─────────────────────────────────────────────────────────────────┐\n│ CLAUDE OPUS 4.6 (Anthropic) 🏆 HIGHEST QUALITY                 │\n├─────────────────────────────────────────────────────────────────┤\n│ Latency:  1,234ms                                               │\n│ Cost:     $0.001875                                             │\n│ Quality:  94/100                                                │\n│ Tokens:   28 in / 125 out                                       │\n└─────────────────────────────────────────────────────────────────┘\n\n🎯 RECOMMENDATIONS:\n1. 
Most cost-effective: Gemini 2.5 Flash-Lite ($0.000025/request) — 99.98% cheaper than Opus\n2. Budget pick: DeepSeek Chat ($0.000048/request) — strong quality at low cost\n3. Best quality: Claude Opus 4.6 (94/100) — state-of-the-art reasoning & analysis\n4. Smart pick: Claude Haiku 4.5 ($0.000145/request) — 81% cheaper, 83% quality match\n5. Speed + Quality: GPT-5.2 ($0.000402/request) — excellent quality at mid-range cost\n\n💡 Potential monthly savings (10,000 requests/day, 28 input + 115 output tokens avg):\n   - Using Gemini 2.5 Flash-Lite vs Opus: $903/month saved ($1.44 vs $904.50)\n   - Using DeepSeek Chat vs Opus: $899/month saved ($4.50 vs $904.50)\n   - Using Claude Haiku vs Opus: $731/month saved ($173.40 vs $904.50)\n\nUse Cases\nProduction Deployment\nEvaluate models before production selection\nCompare cost vs quality tradeoffs\nBenchmark API latency across providers\nPrompt Development\nTest prompt variations across models\nMeasure quality scores consistently\nCompare performance metrics\nCost Analysis\nAnalyze LLM API spending by model\nCompare provider pricing structures\nIdentify cost-efficient alternatives\nPerformance Testing\nMeasure latency and response times\nTest consistency across multiple runs\nEvaluate quality scores\n🚀 Quick Start\n1. Subscribe to Skill\n\nClick \"Subscribe\" on ClawhHub to get access.\n\n2. Set API Keys\n\nAdd keys for the providers you want to test:\n\n# Anthropic (Claude models)\nexport ANTHROPIC_API_KEY=\"sk-ant-...\"\n\n# OpenAI (GPT models)\nexport OPENAI_API_KEY=\"sk-...\"\n\n# Google (Gemini models)\nexport GOOGLE_API_KEY=\"AI...\"\n\n# DeepSeek\nexport DEEPSEEK_API_KEY=\"...\"\n\n# xAI (Grok models)\nexport XAI_API_KEY=\"...\"\n\n# MiniMax\nexport MINIMAX_API_KEY=\"...\"\n\n# Alibaba (Qwen models)\nexport DASHSCOPE_API_KEY=\"...\"\n\n# OpenRouter (Meta Llama models)\nexport OPENROUTER_API_KEY=\"...\"\n\n# Mistral\nexport MISTRAL_API_KEY=\"...\"\n\n\nYou only need keys for the providers you plan to test.\n\n3. Install Dependencies\n# Install only what you need\npip install anthropic          # Claude\npip install openai             # GPT, DeepSeek, xAI, MiniMax, Qwen, Llama\npip install google-generativeai  # Gemini\npip install mistralai          # Mistral\n\n# Or install everything\npip install anthropic openai google-generativeai mistralai\n\n4. 
Run Your First Test\n\nOption A: Python\n\nimport os\nfrom prompt_performance_tester import PromptPerformanceTester\n\ntester = PromptPerformanceTester()  # reads API keys from environment\n\nresults = tester.test_prompt(\n    prompt_text=\"Write a professional email apologizing for a delayed shipment\",\n    models=[\n        \"claude-haiku-4-5-20251001\",\n        \"gpt-5.2\",\n        \"gemini-2.5-flash\",\n        \"deepseek-chat\",\n    ],\n    num_runs=3,\n    max_tokens=500\n)\n\nprint(tester.format_results(results))\nprint(f\"🏆 Best quality:  {results.best_model}\")\nprint(f\"💰 Cheapest:      {results.cheapest_model}\")\nprint(f\"⚡ Fastest:       {results.fastest_model}\")\n\n\nOption B: CLI\n\n# Test across multiple models\nprompt-tester test \"Your prompt here\" \\\n  --models claude-haiku-4-5-20251001 gpt-5.2 gemini-2.5-flash deepseek-chat \\\n  --runs 3\n\n# Export results\nprompt-tester test \"Your prompt here\" --export results.json\n\n🔒 Security & Privacy\nAPI Key Safety\nKeys stored in environment variables only — never hardcoded or logged\nNever transmitted to UnisAI servers\nHTTPS encryption for all provider API calls\nData Privacy\nYour prompts are sent only to the AI providers you select for testing\nEach provider has their own data retention policy (see their privacy pages)\nNo data stored on UnisAI infrastructure\n📚 Technical Details\nSystem Requirements\nPython: 3.9+\nDependencies: anthropic, openai, google-generativeai, mistralai (install only what you need)\nPlatform: macOS, Linux, Windows\nArchitecture\nLazy client initialization — SDK clients only loaded for providers actually tested\nPrefix-based routing — PROVIDER_MAP detects provider from model name; no hardcoded whitelist\nOpenAI-compat path — DeepSeek, xAI, MiniMax, Qwen, and OpenRouter all use the openai SDK with a custom base_url\nPricing table — used for cost calculation only; unknown models get cost=0 with a warning\nMetrics Collected\n\nEvery test captures:\n\nLatency: Total response time (ms)\nCost: Input + output cost based on known pricing (USD)\nQuality: Heuristic response score based on length, completeness (0–100)\nTokens: Exact input/output token counts per provider\nConsistency: Standard deviation across multiple runs\nErrors: Timeouts, rate limits, API failures\n❓ Frequently Asked Questions\n\nQ: Do I need API keys for all 9 providers? A: No. You only need keys for the providers you want to test. If you only test Claude models, you only need ANTHROPIC_API_KEY.\n\nQ: Who pays for the API costs? A: You do. You provide your own API keys and pay each provider directly. This skill has no per-request fees.\n\nQ: How accurate are the cost calculations? A: Costs are calculated from the known pricing table using actual token counts. Models not in the pricing table return $0.00 — the model still runs, the cost just won't be shown.\n\nQ: Can I test models not in the pricing table? A: Yes. Any model whose name starts with a supported prefix will run. Cost will show as $0.00 for unlisted models.\n\nQ: Can I test prompts in non-English languages? A: Yes. All supported providers handle multiple languages.\n\nQ: Can I use this in production/CI/CD? A: Yes. Import PromptPerformanceTester directly from Python or call via CLI.\n\nQ: What if my prompt is very long? A: Set max_tokens appropriately. 
The skill passes your prompt as-is to each provider's API.\n\n🗺️ Roadmap\n✅ Current Release (v1.1.8)\nModel-agnostic architecture — any model ID works via prefix detection\n9 providers, 20 known models with pricing\nDeepSeek, xAI Grok, MiniMax, Qwen, Meta Llama as first-class providers\nClaude 4.6 series (opus-4-6, sonnet-4-6)\nLazy client initialization — only loads SDKs for providers actually used\nFixed UnisAI branding throughout\n🚧 Coming Soon (v1.3)\nBatch testing: Test 100+ prompts simultaneously\nHistorical tracking: Track model performance over time\nWebhook integrations: Slack, Discord, email notifications\n🔮 Future (v1.3+)\nA/B testing framework: Scientific prompt experimentation\nFine-tuning insights: Which models to fine-tune for your use case\nCustom benchmarks: Create your own evaluation criteria\nAuto-optimization: AI-powered prompt improvement suggestions\n📞 Support\nEmail: support@unisai.vercel.app\nWebsite: https://unisai.vercel.app\nBug Reports: support@unisai.vercel.app\n📄 License & Terms\n\nThis skill is distributed via ClawhHub under the following terms.\n\n✅ You CAN:\nUse for your own business and projects\nTest prompts for internal applications\nModify source code for personal use\n❌ You CANNOT:\nRedistribute outside ClawhHub registry\nResell or sublicense\nUse UnisAI trademark without permission\n\nFull Terms: See LICENSE.md\n\n📝 Changelog\n[1.1.8] - 2026-02-27\nFixes & Polish\nBumped version to 1.1.8\nSKILL.md fully rewritten — cleaned up formatting, removed stale content\nRemoved old IP watermark reference (PROPRIETARY_SKILL_VEDANT_2024) from docs\nCorrected watermark to PROPRIETARY_SKILL_UNISAI_2026_MULTI_PROVIDER throughout\nFixed all UnisAI branding (was UniAI in v1.1.0 changelog)\nUpdated pricing table to include all 20 known models\nCleaned up FAQ, Quick Start, and Use Cases sections\n[1.1.6] - 2026-02-27\n🏗️ Model-Agnostic Architecture\nProvider auto-detected from model name prefix — no hardcoded whitelist\nAny new model works automatically without code changes\nAdded DeepSeek, xAI Grok, MiniMax, Qwen, Meta Llama as first-class providers (9 total)\nUpdated Claude to 4.6 series (claude-opus-4-6, claude-sonnet-4-6)\nLazy client initialization — only loads SDKs for providers actually tested\nUnified OpenAI-compat path for DeepSeek, xAI, MiniMax, Qwen, OpenRouter\n[1.1.5] - 2026-02-01\n🚀 Latest Models Update\nGPT-5.2 Series — Added Instant, Thinking, and Pro variants\nGemini 2.5 Series — Updated to 2.5 Pro, Flash, and Flash-Lite\nClaude 4.5 pricing updates\n10 total models across 3 providers\n[1.1.0] - 2026-01-15\n✨ Major Features\nMulti-provider support — Claude, GPT, Gemini\nCross-provider cost comparison\nEnhanced recommendations engine\nRebranded to UnisAI\n[1.0.0] - 2024-02-02\nInitial Release\nClaude-only prompt testing (Haiku, Sonnet, Opus)\nPerformance metrics: latency, cost, quality, consistency\nBasic recommendations engine\n\nLast Updated: February 27, 2026 Current Version: 1.1.8 Status: Active & Maintained\n\n© 2026 UnisAI. All rights reserved."
  },
  "trust": {
    "sourceLabel": "tencent",
    "provenanceUrl": "https://clawhub.ai/vedantsingh60/prompt-performance-tester",
    "publisherUrl": "https://clawhub.ai/vedantsingh60/prompt-performance-tester",
    "owner": "vedantsingh60",
    "version": "1.1.9",
    "license": null,
    "verificationStatus": "Indexed source record"
  },
  "links": {
    "detailUrl": "https://openagent3.xyz/skills/prompt-performance-tester",
    "downloadUrl": "https://openagent3.xyz/downloads/prompt-performance-tester",
    "agentUrl": "https://openagent3.xyz/skills/prompt-performance-tester/agent",
    "manifestUrl": "https://openagent3.xyz/skills/prompt-performance-tester/agent.json",
    "briefUrl": "https://openagent3.xyz/skills/prompt-performance-tester/agent.md"
  }
}