{
  "schemaVersion": "1.0",
  "item": {
    "slug": "adaptivetest",
    "name": "Adaptive Testing",
    "source": "tencent",
    "type": "skill",
    "category": "开发工具",
    "sourceUrl": "https://clawhub.ai/woodstocksoftware/adaptivetest",
    "canonicalUrl": "https://clawhub.ai/woodstocksoftware/adaptivetest",
    "targetPlatform": "OpenClaw"
  },
  "install": {
    "downloadMode": "redirect",
    "downloadUrl": "/downloads/adaptivetest",
    "sourceDownloadUrl": "https://wry-manatee-359.convex.site/api/v1/download?slug=adaptivetest",
    "sourcePlatform": "tencent",
    "targetPlatform": "OpenClaw",
    "installMethod": "Manual import",
    "extraction": "Extract archive",
    "prerequisites": [
      "OpenClaw"
    ],
    "packageFormat": "ZIP package",
    "includedAssets": [
      "SKILL.md",
      "references/calibration.md",
      "references/implementation.md"
    ],
    "primaryDoc": "SKILL.md",
    "quickSetup": [
      "Download the package from Yavira.",
      "Extract the archive and review SKILL.md first.",
      "Import or place the package into your OpenClaw setup."
    ],
    "agentAssist": {
      "summary": "Hand the extracted package to your coding agent with a concrete install brief instead of figuring it out manually.",
      "steps": [
        "Download the package from Yavira.",
        "Extract it into a folder your agent can access.",
        "Paste one of the prompts below and point your agent at the extracted folder."
      ],
      "prompts": [
        {
          "label": "New install",
          "body": "I downloaded a skill package from Yavira. Read SKILL.md from the extracted folder and install it by following the included instructions. Tell me what you changed and call out any manual steps you could not complete."
        },
        {
          "label": "Upgrade existing",
          "body": "I downloaded an updated skill package from Yavira. Read SKILL.md from the extracted folder, compare it with my current installation, and upgrade it while preserving any custom configuration unless the package docs explicitly say otherwise. Summarize what changed and any follow-up checks I should run."
        }
      ]
    },
    "sourceHealth": {
      "source": "tencent",
      "status": "healthy",
      "reason": "direct_download_ok",
      "recommendedAction": "download",
      "checkedAt": "2026-04-23T16:43:11.935Z",
      "expiresAt": "2026-04-30T16:43:11.935Z",
      "httpStatus": 200,
      "finalUrl": "https://wry-manatee-359.convex.site/api/v1/download?slug=4claw-imageboard",
      "contentType": "application/zip",
      "probeMethod": "head",
      "details": {
        "probeUrl": "https://wry-manatee-359.convex.site/api/v1/download?slug=4claw-imageboard",
        "contentDisposition": "attachment; filename=\"4claw-imageboard-1.0.1.zip\"",
        "redirectLocation": null,
        "bodySnippet": null
      },
      "scope": "source",
      "summary": "Source download looks usable.",
      "detail": "Yavira can redirect you to the upstream package for this source.",
      "primaryActionLabel": "Download for OpenClaw",
      "primaryActionHref": "/downloads/adaptivetest"
    },
    "validation": {
      "installChecklist": [
        "Use the Yavira download entry.",
        "Review SKILL.md after the package is downloaded.",
        "Confirm the extracted package contains the expected setup assets."
      ],
      "postInstallChecks": [
        "Confirm the extracted package includes the expected docs or setup files.",
        "Validate the skill or prompts are available in your target agent workspace.",
        "Capture any manual follow-up steps the agent could not complete."
      ]
    },
    "downloadPageUrl": "https://openagent3.xyz/downloads/adaptivetest",
    "agentPageUrl": "https://openagent3.xyz/skills/adaptivetest/agent",
    "manifestUrl": "https://openagent3.xyz/skills/adaptivetest/agent.json",
    "briefUrl": "https://openagent3.xyz/skills/adaptivetest/agent.md"
  },
  "agentAssist": {
    "summary": "Hand the extracted package to your coding agent with a concrete install brief instead of figuring it out manually.",
    "steps": [
      "Download the package from Yavira.",
      "Extract it into a folder your agent can access.",
      "Paste one of the prompts below and point your agent at the extracted folder."
    ],
    "prompts": [
      {
        "label": "New install",
        "body": "I downloaded a skill package from Yavira. Read SKILL.md from the extracted folder and install it by following the included instructions. Tell me what you changed and call out any manual steps you could not complete."
      },
      {
        "label": "Upgrade existing",
        "body": "I downloaded an updated skill package from Yavira. Read SKILL.md from the extracted folder, compare it with my current installation, and upgrade it while preserving any custom configuration unless the package docs explicitly say otherwise. Summarize what changed and any follow-up checks I should run."
      }
    ]
  },
  "documentation": {
    "source": "clawhub",
    "primaryDoc": "SKILL.md",
    "sections": [
      {
        "title": "Adaptive Testing with IRT",
        "body": "Design computerized adaptive tests that measure ability efficiently and accurately using Item Response Theory."
      },
      {
        "title": "Core Concept",
        "body": "Adaptive tests adjust difficulty in real-time based on student responses. A correct answer → harder question. Incorrect → easier question. The result: accurate ability estimates in ~50% fewer questions than fixed-length tests.\n\nKey advantage: Traditional tests waste time on too-easy or too-hard questions. Adaptive tests spend time where measurement matters most — near the student's ability level."
      },
      {
        "title": "Quick Decision Tree",
        "body": "You need to...SeeUnderstand IRT models and parametersIRT FundamentalsDesign a new adaptive testTest Design WorkflowChoose item selection algorithmItem SelectionDecide when to stop the testStopping RulesCalibrate new questionsreferences/calibration.mdImplement CAT algorithmreferences/implementation.md"
      },
      {
        "title": "The 3-Parameter Logistic (3PL) Model",
        "body": "Most adaptive tests use the 3PL model. Each question has three parameters:\n\na (discrimination) — How well the question differentiates ability levels. Higher = steeper curve. Typical range: 0.5 to 2.5\nb (difficulty) — The ability level where P(correct) = 0.5. Range: -3 to +3 (standardized scale)\nc (guessing) — Probability of guessing correctly. Usually 0.2 to 0.25 for multiple choice\n\nProbability of correct response:\n\nP(correct | ability, a, b, c) = c + (1 - c) / (1 + e^(-a(ability - b)))\n\nSimpler models:\n\n2PL: Set c = 0 (no guessing parameter)\n1PL (Rasch): Set c = 0 and a = 1 for all items (only difficulty varies)\n\nUse 3PL for high-stakes tests. Use 2PL/1PL when sample size is small (<500 responses per item)."
      },
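      {
        "title": "Illustrative Sketch: 3PL Probability in Python",
        "body": "Not part of the original package: a minimal Python sketch of the 3PL formula above, to make the parameters concrete. The function name and signature are illustrative, not from any IRT library.\n\nimport math\n\ndef p_correct(theta, a, b, c):\n    # 3PL probability of a correct response at ability theta\n    return c + (1.0 - c) / (1.0 + math.exp(-a * (theta - b)))\n\n# A well-discriminating medium item with a 4-option guess floor.\n# Note that at theta == b the probability is (1 + c) / 2, not 0.5:\np_correct(0.0, a=1.5, b=0.0, c=0.25)  # -> 0.625"
      },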
      {
        "title": "Information and Standard Error",
        "body": "Information measures how precisely an item estimates ability at a given level. Peak information occurs when ability ≈ difficulty (b parameter).\n\nStandard Error (SE) is the inverse of information:\n\nSE = 1 / sqrt(Information)\n\nGoal of CAT: Maximize information (minimize SE) at the student's true ability level."
      },
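      {
        "title": "Illustrative Sketch: Item Information and SE",
        "body": "A follow-up sketch (illustrative, reusing p_correct from the sketch above) that computes 3PL item information with the standard Lord (1980) formula and converts total test information into a standard error.\n\ndef item_information(theta, a, b, c):\n    # Fisher information of a 3PL item at ability theta:\n    # I = a^2 * (q / p) * ((p - c) / (1 - c))^2\n    p = p_correct(theta, a, b, c)\n    q = 1.0 - p\n    return (a ** 2) * (q / p) * ((p - c) / (1.0 - c)) ** 2\n\ndef standard_error(total_information):\n    # SE = 1 / sqrt(information summed over administered items)\n    return 1.0 / (total_information ** 0.5)\n\nInformation peaks near theta == b, which is why a CAT keeps choosing items close to the current ability estimate."
      },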
      {
        "title": "1. Define Test Specifications",
        "body": "Purpose: Placement, diagnostic, certification, progress monitoring?\nContent domain: Single skill or multidimensional?\nTarget population: What ability range (-3 to +3)?\nConstraints: Time limit, minimum/maximum length, content balance"
      },
      {
        "title": "2. Build Item Bank",
        "body": "Minimum bank size: 10× the average test length. For a 20-item CAT, you need ≥200 calibrated items.\n\nDistribution targets:\n\nDifficulty (b): Spread across expected ability range\nDiscrimination (a): Target 1.0 to 2.0 (high discrimination)\nExposure: No item used >20% of the time\n\nContent balancing: If testing math, ensure geometry/algebra/etc. are proportionally represented."
      },
      {
        "title": "3. Choose Algorithms",
        "body": "Pick one from each category:\n\nItem selection: (see below)\n\nMaximum Information\nRandomesque (MFI + exposure control)\nContent balancing\n\nAbility estimation:\n\nMaximum Likelihood Estimation (MLE)\nExpected A Posteriori (EAP) — better for extreme scores\nWeighted Likelihood (WLE)\n\nStopping rule: (see below)\n\nFixed length\nStandard error threshold\nInformation threshold"
      },
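      {
        "title": "Illustrative Sketch: EAP Ability Estimation",
        "body": "A minimal grid-quadrature EAP estimator matching the recommendation above to prefer EAP over MLE for extreme scores. The data layout (items as dicts with a/b/c keys) is an assumption for illustration; production code would use a calibrated bank and a proper quadrature rule. Reuses p_correct and math from the earlier sketch.\n\ndef eap_estimate(responses, lo=-4.0, hi=4.0, n=81):\n    # responses: list of (item, correct) pairs; item is {\"a\": ..., \"b\": ..., \"c\": ...}\n    # A standard-normal prior keeps all-correct / all-incorrect patterns finite.\n    step = (hi - lo) / (n - 1)\n    grid = [lo + i * step for i in range(n)]\n    weights = []\n    for theta in grid:\n        w = math.exp(-0.5 * theta * theta)  # N(0, 1) prior, up to a constant\n        for item, correct in responses:\n            p = p_correct(theta, item[\"a\"], item[\"b\"], item[\"c\"])\n            w *= p if correct else (1.0 - p)\n        weights.append(w)\n    return sum(t * w for t, w in zip(grid, weights)) / sum(weights)"
      },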
      {
        "title": "4. Simulate Performance",
        "body": "Before going live, simulate 1000+ test sessions with known abilities. Check:\n\nAverage test length\nSE at different ability levels\nItem exposure rates\nContent balance adherence\n\nAdjust if needed."
      },
      {
        "title": "Maximum Fisher Information (MFI)",
        "body": "Rule: Select the item with highest information at current ability estimate.\n\nPros: Optimal precision, shortest tests\nCons: Overuses \"best\" items, poor security\n\nUse when: Pilot testing, low-stakes practice"
      },
      {
        "title": "Randomesque (MFI + Exposure Control)",
        "body": "Rule: Select from top N items by information (e.g., top 5), choose randomly from that set.\n\nPros: Balances precision and security\nCons: Slightly longer tests than pure MFI\n\nUse when: Operational tests, default choice"
      },
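      {
        "title": "Illustrative Sketch: Randomesque Selection",
        "body": "A sketch of the randomesque rule above, reusing item_information from the earlier sketch. The item-dict shape and id field are illustrative assumptions.\n\nimport random\n\ndef select_randomesque(theta, item_bank, administered_ids, top_n=5):\n    # Rank unused items by information at the current estimate,\n    # then pick at random from the top N to limit exposure.\n    unused = [it for it in item_bank if it[\"id\"] not in administered_ids]\n    unused.sort(key=lambda it: item_information(theta, it[\"a\"], it[\"b\"], it[\"c\"]), reverse=True)\n    return random.choice(unused[:top_n])\n\nSetting top_n=1 recovers pure MFI; larger values trade a little precision for better item security."
      },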
      {
        "title": "a-Stratified",
        "body": "Rule: Start with high-discrimination items (high a), use mid-discrimination later.\n\nPros: Fast initial ability estimate\nCons: Complex to implement\n\nUse when: Very large item banks, research settings"
      },
      {
        "title": "Content Balancing",
        "body": "Rule: Track content area usage, prioritize underrepresented areas when selecting next item.\n\nImplementation: Weight information by content constraint satisfaction.\n\nUse when: Blueprint requirements, multidimensional tests"
      },
      {
        "title": "Fixed Length",
        "body": "Stop after N items (e.g., 20 questions).\n\nPros: Predictable time, simple\nCons: May over/under-test some students\n\nUse when: Time limits matter, simple implementation needed"
      },
      {
        "title": "Standard Error Threshold",
        "body": "Stop when SE < target (e.g., SE < 0.3).\n\nPros: Consistent precision across ability levels\nCons: Variable test length (harder to schedule)\n\nTypical targets:\n\nLow-stakes: SE < 0.4\nMedium-stakes: SE < 0.3\nHigh-stakes: SE < 0.25\n\nUse when: Precision matters more than time"
      },
      {
        "title": "Combined Rule",
        "body": "Stop when (SE < target) OR (length ≥ max) OR (length ≥ min AND ability estimate stable).\n\nUse when: Production systems (safest approach)"
      },
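      {
        "title": "Illustrative Sketch: Combined Stopping Rule",
        "body": "A direct translation of the combined rule above into a predicate. The thresholds and the stability window are illustrative defaults, not values from the package.\n\ndef should_stop(se, n_items, theta_history, se_target=0.3, min_items=10,\n                max_items=25, window=3, delta=0.05):\n    if n_items >= max_items:\n        return True  # hard cap\n    if se < se_target:\n        return True  # precision reached\n    if n_items >= min_items and len(theta_history) >= window:\n        recent = theta_history[-window:]\n        if max(recent) - min(recent) < delta:\n            return True  # ability estimate has stabilized\n    return False"
      },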
      {
        "title": "Starting Ability Estimate",
        "body": "Options:\n\nPopulation mean (θ = 0)\nPrior information (e.g., grade level, previous test)\nFirst question is medium difficulty, estimate from there\n\nNever start at extremes (-3 or +3)."
      },
      {
        "title": "Handling Extreme Response Patterns",
        "body": "All correct or all incorrect: MLE fails. Use EAP or Bayesian prior to regularize.\n\nRapid changes: If ability estimate jumps >1.0, consider response anomaly (cheating, guessing)."
      },
      {
        "title": "Exposure Control",
        "body": "Track how often each item is used. Flag items used >20% of the time. Consider:\n\nRandomesque selection (above)\nSympson-Hetter method (advanced)\nPeriodic item bank refresh"
      },
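      {
        "title": "Illustrative Sketch: Exposure Audit",
        "body": "A small audit helper for the >20% exposure flag above; the counter layout is an assumption.\n\ndef flag_overexposed(usage_counts, n_sessions, threshold=0.20):\n    # usage_counts: {item_id: number of sessions that administered the item}\n    return [iid for iid, n in usage_counts.items() if n / n_sessions > threshold]\n\nRun this over simulated sessions before launch, then periodically in production."
      },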
      {
        "title": "Multidimensional IRT (MIRT)",
        "body": "If testing multiple skills (e.g., algebra + geometry), use separate ability estimates per dimension. Select items to balance information across dimensions.\n\nWarning: MIRT requires larger item banks and more complex calibration."
      },
      {
        "title": "Common Mistakes",
        "body": "❌ Too few items in bank → High exposure, security risk\n✅ Aim for 10× average test length\n\n❌ Poorly distributed difficulties → Accurate only in narrow ability range\n✅ Spread items across -2 to +2 difficulty\n\n❌ Ignoring content balance → May skip important topics\n✅ Build content constraints into item selection\n\n❌ Using MLE for all incorrect → Returns -∞\n✅ Use EAP or cap estimates at -3/+3\n\n❌ No exposure control → Same items every test\n✅ Use randomesque or Sympson-Hetter"
      },
      {
        "title": "When to Load References",
        "body": "NeedFileCalibrate new items (collect data, estimate parameters)references/calibration.mdImplement CAT algorithm (code patterns, libraries)references/implementation.md"
      },
      {
        "title": "Real-World Example: K-12 Math Placement",
        "body": "Setup:\n\nItem bank: 300 questions, b from -2 (basic) to +2 (advanced)\nTarget: SE < 0.35 or max 25 questions\nContent: 40% algebra, 30% geometry, 30% statistics\nAlgorithm: Randomesque (top 5), EAP estimation\n\nFlow:\n\nStart at θ = 0 (grade-level average)\nSelect item: b ≈ 0, content area needed\nStudent answers → update ability estimate (EAP)\nSelect next: maximize information at new θ, respect content balance, randomesque from top 5\nStop when SE < 0.35 or 25 questions reached\nReport: ability estimate + placement recommendation\n\nResult: Average 18 questions, 95% of students placed within ±0.5 grade levels of true ability."
      },
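      {
        "title": "Illustrative Sketch: End-to-End CAT Loop",
        "body": "A sketch tying the earlier pieces together into the placement flow above. Everything here (function names, the answer_fn callback, the simulated student) is illustrative, and content balancing is omitted for brevity.\n\ndef run_cat(item_bank, answer_fn, se_target=0.35, max_items=25):\n    administered, responses, theta = set(), [], 0.0  # start at the population mean\n    while True:\n        item = select_randomesque(theta, item_bank, administered)\n        administered.add(item[\"id\"])\n        responses.append((item, answer_fn(item)))\n        theta = eap_estimate(responses)\n        info = sum(item_information(theta, it[\"a\"], it[\"b\"], it[\"c\"])\n                   for it, _ in responses)\n        se = standard_error(info)\n        if se < se_target or len(responses) >= max_items:\n            return theta, se, len(responses)\n\ndef simulated_student(true_theta):\n    # For pre-launch simulation: answers correctly with 3PL probability.\n    def answer(item):\n        return random.random() < p_correct(true_theta, item[\"a\"], item[\"b\"], item[\"c\"])\n    return answer\n\n# Usage with a hypothetical calibrated bank:\n# theta_hat, se, n = run_cat(bank, simulated_student(1.2))"
      },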
      {
        "title": "Further Reading",
        "body": "Lord, F. M. (1980). Applications of Item Response Theory to Practical Testing Problems\nWainer, H. (2000). Computerized Adaptive Testing: A Primer (2nd ed.)\nvan der Linden, W. J., & Glas, C. A. W. (2010). Elements of Adaptive Testing\n\nIRT packages:\n\nPython: mirt, girth, catsim\nR: mirt, TAM, catR\nProduction: Custom implementation or AdaptiveTest.io"
      }
    ],
    "body": "Adaptive Testing with IRT\n\nDesign computerized adaptive tests that measure ability efficiently and accurately using Item Response Theory.\n\nCore Concept\n\nAdaptive tests adjust difficulty in real-time based on student responses. A correct answer → harder question. Incorrect → easier question. The result: accurate ability estimates in ~50% fewer questions than fixed-length tests.\n\nKey advantage: Traditional tests waste time on too-easy or too-hard questions. Adaptive tests spend time where measurement matters most — near the student's ability level.\n\nQuick Decision Tree\nYou need to...\tSee\nUnderstand IRT models and parameters\tIRT Fundamentals\nDesign a new adaptive test\tTest Design Workflow\nChoose item selection algorithm\tItem Selection\nDecide when to stop the test\tStopping Rules\nCalibrate new questions\treferences/calibration.md\nImplement CAT algorithm\treferences/implementation.md\nIRT Fundamentals\nThe 3-Parameter Logistic (3PL) Model\n\nMost adaptive tests use the 3PL model. Each question has three parameters:\n\na (discrimination) — How well the question differentiates ability levels. Higher = steeper curve. Typical range: 0.5 to 2.5\nb (difficulty) — The ability level where P(correct) = 0.5. Range: -3 to +3 (standardized scale)\nc (guessing) — Probability of guessing correctly. Usually 0.2 to 0.25 for multiple choice\n\nProbability of correct response:\n\nP(correct | ability, a, b, c) = c + (1 - c) / (1 + e^(-a(ability - b)))\n\n\nSimpler models:\n\n2PL: Set c = 0 (no guessing parameter)\n1PL (Rasch): Set c = 0 and a = 1 for all items (only difficulty varies)\n\nUse 3PL for high-stakes tests. Use 2PL/1PL when sample size is small (<500 responses per item).\n\nInformation and Standard Error\n\nInformation measures how precisely an item estimates ability at a given level. Peak information occurs when ability ≈ difficulty (b parameter).\n\nStandard Error (SE) is the inverse of information:\n\nSE = 1 / sqrt(Information)\n\n\nGoal of CAT: Maximize information (minimize SE) at the student's true ability level.\n\nTest Design Workflow\n1. Define Test Specifications\nPurpose: Placement, diagnostic, certification, progress monitoring?\nContent domain: Single skill or multidimensional?\nTarget population: What ability range (-3 to +3)?\nConstraints: Time limit, minimum/maximum length, content balance\n2. Build Item Bank\n\nMinimum bank size: 10× the average test length. For a 20-item CAT, you need ≥200 calibrated items.\n\nDistribution targets:\n\nDifficulty (b): Spread across expected ability range\nDiscrimination (a): Target 1.0 to 2.0 (high discrimination)\nExposure: No item used >20% of the time\n\nContent balancing: If testing math, ensure geometry/algebra/etc. are proportionally represented.\n\n3. Choose Algorithms\n\nPick one from each category:\n\nItem selection: (see below)\n\nMaximum Information\nRandomesque (MFI + exposure control)\nContent balancing\n\nAbility estimation:\n\nMaximum Likelihood Estimation (MLE)\nExpected A Posteriori (EAP) — better for extreme scores\nWeighted Likelihood (WLE)\n\nStopping rule: (see below)\n\nFixed length\nStandard error threshold\nInformation threshold\n4. Simulate Performance\n\nBefore going live, simulate 1000+ test sessions with known abilities. 
Check:\n\nAverage test length\nSE at different ability levels\nItem exposure rates\nContent balance adherence\n\nAdjust if needed.\n\nItem Selection Strategies\nMaximum Fisher Information (MFI)\n\nRule: Select the item with highest information at current ability estimate.\n\nPros: Optimal precision, shortest tests Cons: Overuses \"best\" items, poor security\n\nUse when: Pilot testing, low-stakes practice\n\nRandomesque (MFI + Exposure Control)\n\nRule: Select from top N items by information (e.g., top 5), choose randomly from that set.\n\nPros: Balances precision and security Cons: Slightly longer tests than pure MFI\n\nUse when: Operational tests, default choice\n\na-Stratified\n\nRule: Start with high-discrimination items (high a), use mid-discrimination later.\n\nPros: Fast initial ability estimate Cons: Complex to implement\n\nUse when: Very large item banks, research settings\n\nContent Balancing\n\nRule: Track content area usage, prioritize underrepresented areas when selecting next item.\n\nImplementation: Weight information by content constraint satisfaction.\n\nUse when: Blueprint requirements, multidimensional tests\n\nStopping Rules\nFixed Length\n\nStop after N items (e.g., 20 questions).\n\nPros: Predictable time, simple Cons: May over/under-test some students\n\nUse when: Time limits matter, simple implementation needed\n\nStandard Error Threshold\n\nStop when SE < target (e.g., SE < 0.3).\n\nPros: Consistent precision across ability levels Cons: Variable test length (harder to schedule)\n\nTypical targets:\n\nLow-stakes: SE < 0.4\nMedium-stakes: SE < 0.3\nHigh-stakes: SE < 0.25\n\nUse when: Precision matters more than time\n\nCombined Rule\n\nStop when (SE < target) OR (length ≥ max) OR (length ≥ min AND ability estimate stable).\n\nUse when: Production systems (safest approach)\n\nPractical Considerations\nStarting Ability Estimate\n\nOptions:\n\nPopulation mean (θ = 0)\nPrior information (e.g., grade level, previous test)\nFirst question is medium difficulty, estimate from there\n\nNever start at extremes (-3 or +3).\n\nHandling Extreme Response Patterns\n\nAll correct or all incorrect: MLE fails. Use EAP or Bayesian prior to regularize.\n\nRapid changes: If ability estimate jumps >1.0, consider response anomaly (cheating, guessing).\n\nExposure Control\n\nTrack how often each item is used. Flag items used >20% of the time. Consider:\n\nRandomesque selection (above)\nSympson-Hetter method (advanced)\nPeriodic item bank refresh\nMultidimensional IRT (MIRT)\n\nIf testing multiple skills (e.g., algebra + geometry), use separate ability estimates per dimension. 
Select items to balance information across dimensions.\n\nWarning: MIRT requires larger item banks and more complex calibration.\n\nCommon Mistakes\n\n❌ Too few items in bank → High exposure, security risk ✅ Aim for 10× average test length\n\n❌ Poorly distributed difficulties → Accurate only in narrow ability range\n✅ Spread items across -2 to +2 difficulty\n\n❌ Ignoring content balance → May skip important topics\n✅ Build content constraints into item selection\n\n❌ Using MLE for all incorrect → Returns -∞\n✅ Use EAP or cap estimates at -3/+3\n\n❌ No exposure control → Same items every test\n✅ Use randomesque or Sympson-Hetter\n\nWhen to Load References\nNeed\tFile\nCalibrate new items (collect data, estimate parameters)\treferences/calibration.md\nImplement CAT algorithm (code patterns, libraries)\treferences/implementation.md\nReal-World Example: K-12 Math Placement\n\nSetup:\n\nItem bank: 300 questions, b from -2 (basic) to +2 (advanced)\nTarget: SE < 0.35 or max 25 questions\nContent: 40% algebra, 30% geometry, 30% statistics\nAlgorithm: Randomesque (top 5), EAP estimation\n\nFlow:\n\nStart at θ = 0 (grade-level average)\nSelect item: b ≈ 0, content area needed\nStudent answers → update ability estimate (EAP)\nSelect next: maximize information at new θ, respect content balance, randomesque from top 5\nStop when SE < 0.35 or 25 questions reached\nReport: ability estimate + placement recommendation\n\nResult: Average 18 questions, 95% of students placed within ±0.5 grade levels of true ability.\n\nFurther Reading\nLord, F. M. (1980). Applications of Item Response Theory to Practical Testing Problems\nWainer, H. (2000). Computerized Adaptive Testing: A Primer (2nd ed.)\nvan der Linden, W. J., & Glas, C. A. W. (2010). Elements of Adaptive Testing\n\nIRT packages:\n\nPython: mirt, girth, catsim\nR: mirt, TAM, catR\nProduction: Custom implementation or AdaptiveTest.io"
  },
  "trust": {
    "sourceLabel": "tencent",
    "provenanceUrl": "https://clawhub.ai/woodstocksoftware/adaptivetest",
    "publisherUrl": "https://clawhub.ai/woodstocksoftware/adaptivetest",
    "owner": "woodstocksoftware",
    "version": "1.0.3",
    "license": null,
    "verificationStatus": "Indexed source record"
  },
  "links": {
    "detailUrl": "https://openagent3.xyz/skills/adaptivetest",
    "downloadUrl": "https://openagent3.xyz/downloads/adaptivetest",
    "agentUrl": "https://openagent3.xyz/skills/adaptivetest/agent",
    "manifestUrl": "https://openagent3.xyz/skills/adaptivetest/agent.json",
    "briefUrl": "https://openagent3.xyz/skills/adaptivetest/agent.md"
  }
}