{
  "schemaVersion": "1.0",
  "item": {
    "slug": "llm-eval-router",
    "name": "Llm Eval Router",
    "source": "tencent",
    "type": "skill",
    "category": "AI 智能",
    "sourceUrl": "https://clawhub.ai/nissan/llm-eval-router",
    "canonicalUrl": "https://clawhub.ai/nissan/llm-eval-router",
    "targetPlatform": "OpenClaw"
  },
  "install": {
    "downloadMode": "redirect",
    "downloadUrl": "/downloads/llm-eval-router",
    "sourceDownloadUrl": "https://wry-manatee-359.convex.site/api/v1/download?slug=llm-eval-router",
    "sourcePlatform": "tencent",
    "targetPlatform": "OpenClaw",
    "installMethod": "Manual import",
    "extraction": "Extract archive",
    "prerequisites": [
      "OpenClaw"
    ],
    "packageFormat": "ZIP package",
    "includedAssets": [
      "SKILL.md"
    ],
    "primaryDoc": "SKILL.md",
    "quickSetup": [
      "Download the package from Yavira.",
      "Extract the archive and review SKILL.md first.",
      "Import or place the package into your OpenClaw setup."
    ],
    "agentAssist": {
      "summary": "Hand the extracted package to your coding agent with a concrete install brief instead of figuring it out manually.",
      "steps": [
        "Download the package from Yavira.",
        "Extract it into a folder your agent can access.",
        "Paste one of the prompts below and point your agent at the extracted folder."
      ],
      "prompts": [
        {
          "label": "New install",
          "body": "I downloaded a skill package from Yavira. Read SKILL.md from the extracted folder and install it by following the included instructions. Tell me what you changed and call out any manual steps you could not complete."
        },
        {
          "label": "Upgrade existing",
          "body": "I downloaded an updated skill package from Yavira. Read SKILL.md from the extracted folder, compare it with my current installation, and upgrade it while preserving any custom configuration unless the package docs explicitly say otherwise. Summarize what changed and any follow-up checks I should run."
        }
      ]
    },
    "sourceHealth": {
      "source": "tencent",
      "status": "healthy",
      "reason": "direct_download_ok",
      "recommendedAction": "download",
      "checkedAt": "2026-04-30T16:55:25.780Z",
      "expiresAt": "2026-05-07T16:55:25.780Z",
      "httpStatus": 200,
      "finalUrl": "https://wry-manatee-359.convex.site/api/v1/download?slug=network",
      "contentType": "application/zip",
      "probeMethod": "head",
      "details": {
        "probeUrl": "https://wry-manatee-359.convex.site/api/v1/download?slug=network",
        "contentDisposition": "attachment; filename=\"network-1.0.0.zip\"",
        "redirectLocation": null,
        "bodySnippet": null
      },
      "scope": "source",
      "summary": "Source download looks usable.",
      "detail": "Yavira can redirect you to the upstream package for this source.",
      "primaryActionLabel": "Download for OpenClaw",
      "primaryActionHref": "/downloads/llm-eval-router"
    },
    "validation": {
      "installChecklist": [
        "Use the Yavira download entry.",
        "Review SKILL.md after the package is downloaded.",
        "Confirm the extracted package contains the expected setup assets."
      ],
      "postInstallChecks": [
        "Confirm the extracted package includes the expected docs or setup files.",
        "Validate the skill or prompts are available in your target agent workspace.",
        "Capture any manual follow-up steps the agent could not complete."
      ]
    },
    "downloadPageUrl": "https://openagent3.xyz/downloads/llm-eval-router",
    "agentPageUrl": "https://openagent3.xyz/skills/llm-eval-router/agent",
    "manifestUrl": "https://openagent3.xyz/skills/llm-eval-router/agent.json",
    "briefUrl": "https://openagent3.xyz/skills/llm-eval-router/agent.md"
  },
  "agentAssist": {
    "summary": "Hand the extracted package to your coding agent with a concrete install brief instead of figuring it out manually.",
    "steps": [
      "Download the package from Yavira.",
      "Extract it into a folder your agent can access.",
      "Paste one of the prompts below and point your agent at the extracted folder."
    ],
    "prompts": [
      {
        "label": "New install",
        "body": "I downloaded a skill package from Yavira. Read SKILL.md from the extracted folder and install it by following the included instructions. Tell me what you changed and call out any manual steps you could not complete."
      },
      {
        "label": "Upgrade existing",
        "body": "I downloaded an updated skill package from Yavira. Read SKILL.md from the extracted folder, compare it with my current installation, and upgrade it while preserving any custom configuration unless the package docs explicitly say otherwise. Summarize what changed and any follow-up checks I should run."
      }
    ]
  },
  "documentation": {
    "source": "clawhub",
    "primaryDoc": "SKILL.md",
    "sections": [
      {
        "title": "llm-eval-router",
        "body": "Set up a production-quality shadow evaluation pipeline that automatically\npromotes local Ollama models when they statistically prove they match cloud\nmodel quality — reducing inference costs with evidence, not hope."
      },
      {
        "title": "The core idea",
        "body": "Run every task through your best local model (shadow) in parallel with your\ncloud baseline (ground truth). A lightweight judge ensemble scores the local\noutput. After 200+ runs, if the local model hits 0.95 mean score, promote it\nto handle that task type in production. Demote it automatically if quality drops."
      },
      {
        "title": "When to use",
        "body": "You're paying for Claude/GPT API calls on tasks that don't need that quality\nYou have Ollama running locally with capable models (qwen2.5, phi4, mistral, etc.)\nYou want evidence-based cost reduction, not blind routing\nYou have defined task types: summarize, classify, extract, format, analyze, RAG"
      },
      {
        "title": "When NOT to use",
        "body": "Tasks that require real-time web knowledge (use cloud)\nTasks with strict latency requirements < 2 seconds (local models on CPU are slow)\nTasks with high safety stakes (always use cloud with safety filters)\nYou don't have Ollama or a Mac/Linux machine with enough RAM (8GB+ per model)"
      },
      {
        "title": "Prerequisites",
        "body": "Ollama installed and running (ollama.com)\nAt least one capable model: ollama pull qwen2.5 or ollama pull phi4\nPython 3.10+\nAPI keys: Anthropic (ground truth) + OpenAI (judge) — Gemini optional (tiebreaker)\nLangfuse for observability (self-hosted or cloud) — optional but strongly recommended"
      },
      {
        "title": "Network & Privacy",
        "body": "This skill makes outbound API calls to:\n\nAnthropic API — to generate ground truth baseline responses (every accumulation cycle)\nOpenAI API — for judge scoring (sampled at 15% of runs)\nGoogle Gemini API — tiebreaker judge only (when primary judges disagree by ≥0.20)\n\nWhat stays local:\n\nAll Ollama model inference runs entirely on your device\nScored run data is stored on disk in data/scores/*.json\nNo telemetry, analytics, or data collection of any kind\nNo data is sent anywhere other than the explicit API calls above\n\nLangfuse (optional) can be self-hosted or cloud. If self-hosted, all observability data stays on your network."
      },
      {
        "title": "6-Dimension Evaluation",
        "body": "Every response is scored on:\n\nDimensionDefault weightAnalyze weightWhat it measuresStructural25%10%Format compliance, required keys presentSemantic25%40%Meaning equivalence to ground truthFactual20%25%No hallucinated facts/numbers/entitiesCompletion15%18%Task fully addressedTool use10%4%Correct tool/format selectionLatency5%3%Within acceptable bounds\n\nImportant: Use per-task weight overrides. The default 25/25 split treats structural\naccuracy equally with semantic similarity — which works for extract/classify/format tasks\n(where exact format matters) but is wrong for open-ended analysis. difflib.SequenceMatcher\non two prose analyses of the same question scores ~0.29 even when they're semantically\nidentical. With structural weight at 25%, this alone caps analyze scores at ~0.59.\n\n# src/evaluator.py — per-task weight profiles\nTASK_WEIGHT_OVERRIDES = {\n    \"analyze\": {\n        \"structural_accuracy\": 0.10,   # difflib is NOT meaningful for prose\n        \"semantic_similarity\": 0.40,   # cosine over embeddings captures meaning\n        \"factual_drift\": 0.25,\n        \"task_completion\": 0.18,\n        \"tool_correctness\": 0.04,\n        \"latency_score\": 0.03,\n    },\n    \"code_transform\": {\n        \"structural_accuracy\": 0.15,\n        \"semantic_similarity\": 0.35,\n        \"factual_drift\": 0.20,\n        \"task_completion\": 0.20,\n        \"tool_correctness\": 0.07,\n        \"latency_score\": 0.03,\n    },\n}\n\nAlso: For analyze tasks, constrain output structure via system_prompt so GT and\ncandidates produce comparably-formatted responses (Finding/Recommendation/Confidence/Reasoning).\nThis reduces Layer 2 drift and improves difflib scores even at reduced weight."
      },
      {
        "title": "Judge ensemble",
        "body": "Primary judges (15% sampling rate): Claude Sonnet + gpt-4o-mini score independently\nTiebreaker (only when |score_A - score_B| ≥ 0.20): Gemini 2.5-flash\nUnsampled runs (85%): Layer 1+2 validators only (deterministic, free)\nPromotion gates always trigger full judge evaluation regardless of sampling rate"
      },
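      {
        "title": "Judge call (illustrative sketch)",
        "body": "Step 3 awaits judge_sonnet and friends but never shows one. A minimal sketch of what such a judge could look like, assuming the official anthropic SDK; the model ID, rubric wording, and 0.0-1.0 parsing are illustrative choices, not values from the package.\n\n# judge_sketch.py (hypothetical): swap in whichever Sonnet snapshot you actually run\nimport anthropic\n\nJUDGE_MODEL = \"claude-sonnet-4-20250514\"   # assumption\nclient = anthropic.AsyncAnthropic()        # reads ANTHROPIC_API_KEY from the environment\n\nasync def judge_sonnet(prompt: str, local_response: str, gt_response: str) -> float:\n    \"\"\"Ask the judge to score the candidate against the ground truth, 0.0-1.0.\"\"\"\n    rubric = (\n        \"Score how well the candidate answer matches the reference answer.\\n\"\n        \"Reply with a single number between 0.0 and 1.0, nothing else.\\n\\n\"\n        f\"Task prompt:\\n{prompt}\\n\\nReference:\\n{gt_response}\\n\\nCandidate:\\n{local_response}\"\n    )\n    msg = await client.messages.create(\n        model=JUDGE_MODEL,\n        max_tokens=10,\n        messages=[{\"role\": \"user\", \"content\": rubric}],\n    )\n    try:\n        return max(0.0, min(1.0, float(msg.content[0].text.strip())))\n    except ValueError:\n        return 0.0  # unparseable judge output is treated as a failed sample\n\njudge_gpt4o_mini and judge_gemini would follow the same shape against their respective SDKs."
      },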
      {
        "title": "Layer 1+2 validators (free, deterministic)",
        "body": "Layer 1: JSON validity, required key presence, forbidden pattern check\nLayer 2: Drift detection — novel entities/numbers/URLs not in ground truth\n\nThese run on every response at zero cost. Judges only run when L1+L2 pass and\nthe sampling rate triggers."
      },
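      {
        "title": "Layer 1+2 validators (sketch)",
        "body": "A minimal sketch of the checks described above, using only the stdlib; the forbidden-pattern list, the 0.1 drift penalty per novel token, and the function names are illustrative assumptions rather than the package's implementation, and the required-key check is omitted for brevity.\n\n# validators_sketch.py (illustrative)\nimport json\nimport re\n\nFORBIDDEN_PATTERNS = [r\"(?i)as an ai language model\"]  # assumption: whatever counts as a hard fail\n\ndef layer1(response: str, task_type: str, require_json: bool = False) -> float:\n    \"\"\"Deterministic format/safety gate: 1.0 pass, 0.0 hard fail.\"\"\"\n    for pat in FORBIDDEN_PATTERNS:\n        if re.search(pat, response):\n            return 0.0\n    if require_json:\n        try:\n            json.loads(response)\n        except json.JSONDecodeError:\n            return 0.0\n    return 1.0\n\ndef layer2(response: str, gt_response: str) -> float:\n    \"\"\"Heuristic drift: penalize numbers/URLs that appear only in the candidate.\"\"\"\n    def tokens(text: str) -> set[str]:\n        return set(re.findall(r\"https?://\\S+|\\d+(?:\\.\\d+)?\", text))\n    novel = tokens(response) - tokens(gt_response)\n    return max(0.0, 1.0 - 0.1 * len(novel))  # assumption: 0.1 penalty per novel item, floored at 0"
      },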
      {
        "title": "Promotion / Demotion",
        "body": "Promote: 200+ runs, rolling mean ≥ 0.95 for a model/task pair\nDemote: rolling 7-day pass rate < 0.92\nControl floor: one model (phi4, granite4, or similar) serves as the measured floor —\nany model scoring below it should be flagged, not promoted"
      },
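      {
        "title": "Control floor gate (sketch)",
        "body": "One way the control-floor rule could be wired into the promotion check; the function name and arguments are hypothetical, building on the ModelStats thresholds shown in Step 4.\n\n# control_floor_sketch.py (illustrative)\ndef eligible_for_promotion(candidate_mean: float, candidate_n: int,\n                           floor_mean: float) -> bool:\n    \"\"\"Promote only if the candidate clears the absolute bar AND the measured floor.\"\"\"\n    if candidate_n < 200 or candidate_mean < 0.95:\n        return False   # not enough evidence, or not good enough\n    if candidate_mean < floor_mean:\n        return False   # below the floor model: flag for review instead of promoting\n    return True"
      },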
      {
        "title": "Step 1 — Define your task types",
        "body": "Create config/task_types.yaml:\n\ntasks:\n  - id: summarize\n    description: \"Summarize a document in N sentences\"\n    require_json: false\n    judge_dimensions: [semantic, factual, completion]\n\n  - id: classify\n    description: \"Classify text into one of N categories\"\n    require_json: true    # response must be valid JSON\n    judge_dimensions: [structural, semantic, completion]\n\n  - id: extract\n    description: \"Extract structured data from unstructured text\"\n    require_json: true\n    judge_dimensions: [structural, factual, completion]\n\n  - id: format\n    description: \"Reformat content to match a template\"\n    require_json: false\n    judge_dimensions: [structural, semantic, completion]"
      },
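      {
        "title": "Loading task_types.yaml (sketch)",
        "body": "A small loader so the rest of the pipeline can look up require_json and judge_dimensions per task type; assumes PyYAML is installed, and the helper name is illustrative.\n\n# config_loader_sketch.py (illustrative)\nimport yaml\n\ndef load_task_types(path: str = \"config/task_types.yaml\") -> dict[str, dict]:\n    \"\"\"Index task definitions by id for quick lookup.\"\"\"\n    with open(path) as f:\n        raw = yaml.safe_load(f)\n    return {task[\"id\"]: task for task in raw[\"tasks\"]}\n\n# e.g. TASK_TYPES[\"classify\"][\"require_json\"] -> True\nTASK_TYPES = load_task_types()"
      },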
      {
        "title": "Step 2 — Set up the router",
        "body": "The router assigns each task to a model using a round-robin strategy during\nburn-in (building n), then switches to confidence-weighted routing after promotion.\n\n# src/router.py — simplified version\nclass Router:\n    def __init__(self, candidates: list[str], control_floor: str):\n        self.candidates = candidates\n        self.control_floor = control_floor\n        self._rr_counters = defaultdict(int)\n\n    def route(self, task_type: str, confidence_tracker: ConfidenceTracker) -> str:\n        \"\"\"Return the best model for this task type.\"\"\"\n        promoted = confidence_tracker.get_promoted(task_type)\n        if promoted:\n            return promoted  # use promoted model directly\n\n        # Round-robin during burn-in for fair exposure\n        idx = self._rr_counters[task_type] % len(self.candidates)\n        self._rr_counters[task_type] += 1\n        return self.candidates[idx]"
      },
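      {
        "title": "Router usage (sketch)",
        "body": "A hypothetical wiring of the simplified Router above; the candidate list mirrors the fallback chains in Step 6, and confidence_tracker stands in for the tracker built in Step 4.\n\n# usage sketch (illustrative)\nrouter = Router(candidates=[\"qwen2.5\", \"llama3.1\", \"phi4\"], control_floor=\"phi4\")\n\nmodel = router.route(\"summarize\", confidence_tracker)\nprint(f\"routing summarize to {model}\")  # round-robin until a model is promoted"
      },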
      {
        "title": "Step 3 — Ground truth comparison",
        "body": "For each task, run it through BOTH the local model (candidate) and the cloud\nbaseline (ground truth). Never use the ground truth response in production —\nit's only for evaluation.\n\nasync def evaluate_pair(prompt: str, local_response: str, gt_response: str,\n                        task_type: str) -> float:\n    # Layer 1: deterministic\n    l1_score = validators.layer1(local_response, task_type)\n    if l1_score == 0.0:\n        return 0.0  # hard fail — safety or format violation\n\n    # Layer 2: heuristic drift\n    l2_score = validators.layer2(local_response, gt_response)\n\n    # Sample judges (15%)\n    if random.random() < JUDGE_SAMPLE_RATE:\n        sonnet_score = await judge_sonnet(prompt, local_response, gt_response)\n        mini_score = await judge_gpt4o_mini(prompt, local_response, gt_response)\n        if abs(sonnet_score - mini_score) >= 0.20:\n            gemini_score = await judge_gemini(prompt, local_response, gt_response)\n            final = median([sonnet_score, mini_score, gemini_score])\n        else:\n            final = (sonnet_score + mini_score) / 2\n        return weighted_score(l1_score, l2_score, final)\n    else:\n        return weighted_score(l1_score, l2_score, judge_score=None)"
      },
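      {
        "title": "weighted_score (sketch)",
        "body": "evaluate_pair above calls weighted_score but the snippet stops short of it. A minimal sketch, assuming the blend weights below (placeholders, not the package's values); in the full pipeline the 6-dimension weights from the evaluation section apply. Returning None for unsampled runs follows the \"None is not 0.0\" lesson later in this document.\n\n# weighted_score_sketch.py (illustrative)\nfrom typing import Optional\n\ndef weighted_score(l1_score: float, l2_score: float,\n                   judge_score: Optional[float] = None) -> Optional[float]:\n    \"\"\"Blend validator and judge signals into one composite score.\"\"\"\n    if l1_score == 0.0:\n        return 0.0     # hard fail short-circuits everything\n    if judge_score is None:\n        return None    # unsampled run: store None, exclude from the rolling mean\n    # assumed split: 20% Layer 1, 30% Layer 2, 50% judge ensemble\n    return 0.2 * l1_score + 0.3 * l2_score + 0.5 * judge_score"
      },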
      {
        "title": "Step 4 — Confidence tracker",
        "body": "Track scores per model/task pair on disk (so restarts don't lose data):\n\n# src/scoring/confidence.py — simplified\n@dataclass\nclass ModelStats:\n    model_id: str\n    task_type: str\n    scores: list[float]   # all scores (None excluded)\n    promoted: bool = False\n    demoted: bool = False\n\n    @property\n    def mean(self) -> float:\n        return sum(self.scores) / len(self.scores) if self.scores else 0.0\n\n    @property\n    def n(self) -> int:\n        return len(self.scores)\n\n    def should_promote(self) -> bool:\n        return self.n >= 200 and self.mean >= 0.95 and not self.promoted\n\n    def should_demote(self) -> bool:\n        recent = self.scores[-50:]  # last 50\n        pass_rate = sum(1 for s in recent if s >= 0.85) / len(recent)\n        return pass_rate < 0.92 and not self.demoted"
      },
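      {
        "title": "Persisting scores to disk (sketch)",
        "body": "The tracker is described as surviving restarts via data/scores/*.json, but the snippet above only shows the in-memory dataclass. One plausible persistence layer; the file-naming scheme and helper names are assumptions.\n\n# persistence_sketch.py (illustrative)\nimport json\nfrom pathlib import Path\n\nSCORES_DIR = Path(\"data/scores\")\n\ndef save_stats(stats: ModelStats) -> None:\n    \"\"\"Write one model/task record as JSON so restarts don't lose history.\"\"\"\n    SCORES_DIR.mkdir(parents=True, exist_ok=True)\n    name = f\"{stats.model_id}__{stats.task_type}.json\".replace(\":\", \"_\")  # phi4:latest -> phi4_latest\n    (SCORES_DIR / name).write_text(json.dumps({\n        \"model_id\": stats.model_id,\n        \"task_type\": stats.task_type,\n        \"scores\": stats.scores,\n        \"promoted\": stats.promoted,\n        \"demoted\": stats.demoted,\n    }, indent=2))\n\ndef load_stats(model_id: str, task_type: str) -> ModelStats | None:\n    name = f\"{model_id}__{task_type}.json\".replace(\":\", \"_\")\n    path = SCORES_DIR / name\n    if not path.exists():\n        return None\n    return ModelStats(**json.loads(path.read_text()))"
      },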
      {
        "title": "Step 5 — Accumulator loop",
        "body": "Run this on a cron (every 10-20 minutes via launchd/systemd):\n\n# run_accumulate.py\nasync def accumulate():\n    task_type = pick_next_task()  # round-robin across task types\n    prompt, gt_response = generate_task(task_type)  # call cloud baseline\n\n    for candidate in router.get_candidates(task_type):\n        local_response = await ollama_client.complete(candidate, prompt)\n        score = await evaluate_pair(prompt, local_response, gt_response, task_type)\n        confidence_tracker.record(candidate, task_type, score)\n\n        if confidence_tracker.should_promote(candidate, task_type):\n            router.promote(candidate, task_type)\n            langfuse.log_promotion(candidate, task_type, confidence_tracker.stats(candidate, task_type))"
      },
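      {
        "title": "Scheduling the accumulator (sketch)",
        "body": "The loop above is meant to run as a one-shot process every 10-20 minutes. A minimal entry point plus an example cron line; the project path, venv location, and interval are placeholders for your own setup (launchd/systemd equivalents work the same way).\n\n# tail of run_accumulate.py (illustrative)\nimport asyncio\n\nif __name__ == \"__main__\":\n    asyncio.run(accumulate())\n\n# example crontab entry (assumption: venv at .venv, 15-minute cadence):\n# */15 * * * * cd /path/to/llm-eval-router && .venv/bin/python run_accumulate.py >> logs/accumulate.log 2>&1"
      },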
      {
        "title": "Step 6 — Routing policy",
        "body": "# config/routing_policy.yaml\ncontrol_floor_model: phi4:latest   # never promote below this model's score\n\ntask_policies:\n  policy_check_high_risk:\n    never_local: true              # these tasks always use cloud model\n\n  summarize:\n    min_score_for_routing: 0.85\n    fallback_chain: [qwen2.5, llama3.1, phi4]\n\n  classify:\n    min_score_for_routing: 0.90   # higher bar for classification\n    fallback_chain: [qwen2.5, granite4, llama3.1]"
      },
      {
        "title": "Step 7 — API",
        "body": "Expose a simple HTTP API (FastAPI):\n\nPOST /run          — route a task through the best available model\nGET  /health       — service status + promoted models + ollama connectivity\nGET  /status       — full scoreboard (model × task × mean × n)\nGET  /report       — cost heatmap + efficiency analysis"
      },
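      {
        "title": "FastAPI skeleton (sketch)",
        "body": "A minimal layout for the endpoints listed above, assuming the Router, confidence tracker, and Ollama client objects from earlier steps; the scoreboard/ping helper methods and response shapes are assumptions, not the package's API.\n\n# api_sketch.py (illustrative)\nfrom fastapi import FastAPI\nfrom pydantic import BaseModel\n\napp = FastAPI(title=\"llm-eval-router\")\n\nclass RunRequest(BaseModel):\n    task_type: str\n    prompt: str\n\n@app.post(\"/run\")\nasync def run_task(req: RunRequest):\n    model = router.route(req.task_type, confidence_tracker)  # Step 2: champion or round-robin\n    response = await ollama_client.complete(model, req.prompt)\n    return {\"model\": model, \"response\": response}\n\n@app.get(\"/health\")\nasync def health():\n    return {\n        \"status\": \"ok\",\n        \"promoted\": confidence_tracker.all_promoted(),   # assumed helper\n        \"ollama_reachable\": await ollama_client.ping(),  # assumed helper\n    }\n\n@app.get(\"/status\")\nasync def status():\n    return confidence_tracker.scoreboard()  # assumed helper: model x task x mean x n\n\nRun with uvicorn api_sketch:app --port 8080; the /report endpoint would follow the same pattern."
      },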
      {
        "title": "Key lessons learned (from 900+ production runs)",
        "body": "What worked:\n\nphi4 as control floor: a measured floor model prevents \"promoted because everyone\nelse is also bad\" errors. If the floor model beats a candidate, flag it — don't promote.\nThinking token stripping: CoT models (deepseek-r1, qwen2.5-coder with reasoning)\nmust have <think>...</think> blocks stripped before evaluation. Otherwise Layer 2\ndrift detection flags the reasoning chain as hallucinated content.\nNone ≠ 0.0 for unsampled runs: a run where no judge scored is not a failing run.\nStore None, exclude from mean. Mixing None with 0.0 poisons the mean.\nrequire_json: False for plain-text tasks: classify and extract tasks that return\nformatted text (not JSON objects) will fail Layer 1 if you require JSON. Separate\nthe \"is the format correct\" check from \"is it valid JSON.\"\nPer-task weight overrides: do not use one weight profile for all task types.\nStructural accuracy (difflib) is wrong for prose analysis — use semantic similarity as\nthe primary signal for open-ended tasks. This lifted analyze mean from 0.44–0.59 to 0.70.\nStructured output prompts for analyze tasks: add a system_prompt that specifies\nan exact output format (Finding/Recommendation/Confidence/Reasoning). Both GT and\ncandidates follow the same template, improving structural alignment and reducing drift\npenalty. Without this, Layer 2 drift fires on differently-phrased but correct analyses.\nMCP server for agentic access: expose CP as MCP tools (run_task, get_status,\nget_champions, get_promotion_timeline, get_cost_heatmap). Lets an LLM agent\nquery evaluation state without bespoke integration work.\n\nWhat didn't work:\n\nLarge models (>9GB): gpt-oss:20b and similar required 39+ second inference —\nthe latency dimension alone tanks the composite score. Practical ceiling is ~9GB models\non 24GB unified memory to avoid GPU memory swapping.\n100% judge sampling: runs through the full Claude+GPT+Gemini panel on every evaluation\ncosts more in judge API fees than you save by routing locally. Sample at 15%.\nChroma 1.5.1 with Python 3.14: Pydantic V1 BaseSettings incompatibility. Use\nqdrant or numpy cosine store instead.\nOne-size-fits-all weight profiles: defining global weights at system init and never\noverriding per task type led to all analyze evals silently failing for 112+ runs.\nLesson: evaluate your evaluator's scores by task type early — if a whole task type\ncaps at a suspicious ceiling (e.g. 0.59), the metric is wrong, not the models."
      },
      {
        "title": "Expected timeline",
        "body": "With a 20-minute accumulator cadence and 9 candidates × 7 task types:\n\nFirst 50 runs per model: ~5 hours\nFirst promotions (200 runs): ~1-2 days per model/task pair\nStable routing layer: 1-2 weeks"
      },
      {
        "title": "Cost estimate",
        "body": "Per accumulation cycle (one task, one model):\n\nGround truth: ~$0.002 (Claude Sonnet, ~500 input + 200 output tokens)\nJudge sample (15%): ~$0.003 (Sonnet + GPT-4o-mini)\nLocal model: $0 (Ollama, on-device)\n\nAt 6 runs/hour × 24 hours: ~$0.70/day during burn-in.\nAfter first promotions: drops to ~$0.10/day (90%+ of task volume local)."
      }
    ]
  },
  "trust": {
    "sourceLabel": "tencent",
    "provenanceUrl": "https://clawhub.ai/nissan/llm-eval-router",
    "publisherUrl": "https://clawhub.ai/nissan/llm-eval-router",
    "owner": "nissan",
    "version": "1.2.1",
    "license": null,
    "verificationStatus": "Indexed source record"
  },
  "links": {
    "detailUrl": "https://openagent3.xyz/skills/llm-eval-router",
    "downloadUrl": "https://openagent3.xyz/downloads/llm-eval-router",
    "agentUrl": "https://openagent3.xyz/skills/llm-eval-router/agent",
    "manifestUrl": "https://openagent3.xyz/skills/llm-eval-router/agent.json",
    "briefUrl": "https://openagent3.xyz/skills/llm-eval-router/agent.md"
  }
}