{
  "schemaVersion": "1.0",
  "item": {
    "slug": "azure-ai-evaluation-py",
    "name": "Azure Ai Evaluation Py",
    "source": "tencent",
    "type": "skill",
    "category": "AI 智能",
    "sourceUrl": "https://clawhub.ai/thegovind/azure-ai-evaluation-py",
    "canonicalUrl": "https://clawhub.ai/thegovind/azure-ai-evaluation-py",
    "targetPlatform": "OpenClaw"
  },
  "install": {
    "downloadMode": "redirect",
    "downloadUrl": "/downloads/azure-ai-evaluation-py",
    "sourceDownloadUrl": "https://wry-manatee-359.convex.site/api/v1/download?slug=azure-ai-evaluation-py",
    "sourcePlatform": "tencent",
    "targetPlatform": "OpenClaw",
    "installMethod": "Manual import",
    "extraction": "Extract archive",
    "prerequisites": [
      "OpenClaw"
    ],
    "packageFormat": "ZIP package",
    "includedAssets": [
      "SKILL.md",
      "references/built-in-evaluators.md",
      "references/custom-evaluators.md",
      "scripts/run_batch_evaluation.py"
    ],
    "primaryDoc": "SKILL.md",
    "quickSetup": [
      "Download the package from Yavira.",
      "Extract the archive and review SKILL.md first.",
      "Import or place the package into your OpenClaw setup."
    ],
    "agentAssist": {
      "summary": "Hand the extracted package to your coding agent with a concrete install brief instead of figuring it out manually.",
      "steps": [
        "Download the package from Yavira.",
        "Extract it into a folder your agent can access.",
        "Paste one of the prompts below and point your agent at the extracted folder."
      ],
      "prompts": [
        {
          "label": "New install",
          "body": "I downloaded a skill package from Yavira. Read SKILL.md from the extracted folder and install it by following the included instructions. Tell me what you changed and call out any manual steps you could not complete."
        },
        {
          "label": "Upgrade existing",
          "body": "I downloaded an updated skill package from Yavira. Read SKILL.md from the extracted folder, compare it with my current installation, and upgrade it while preserving any custom configuration unless the package docs explicitly say otherwise. Summarize what changed and any follow-up checks I should run."
        }
      ]
    },
    "sourceHealth": {
      "source": "tencent",
      "status": "healthy",
      "reason": "direct_download_ok",
      "recommendedAction": "download",
      "checkedAt": "2026-04-23T16:43:11.935Z",
      "expiresAt": "2026-04-30T16:43:11.935Z",
      "httpStatus": 200,
      "finalUrl": "https://wry-manatee-359.convex.site/api/v1/download?slug=4claw-imageboard",
      "contentType": "application/zip",
      "probeMethod": "head",
      "details": {
        "probeUrl": "https://wry-manatee-359.convex.site/api/v1/download?slug=4claw-imageboard",
        "contentDisposition": "attachment; filename=\"4claw-imageboard-1.0.1.zip\"",
        "redirectLocation": null,
        "bodySnippet": null
      },
      "scope": "source",
      "summary": "Source download looks usable.",
      "detail": "Yavira can redirect you to the upstream package for this source.",
      "primaryActionLabel": "Download for OpenClaw",
      "primaryActionHref": "/downloads/azure-ai-evaluation-py"
    },
    "validation": {
      "installChecklist": [
        "Use the Yavira download entry.",
        "Review SKILL.md after the package is downloaded.",
        "Confirm the extracted package contains the expected setup assets."
      ],
      "postInstallChecks": [
        "Confirm the extracted package includes the expected docs or setup files.",
        "Validate the skill or prompts are available in your target agent workspace.",
        "Capture any manual follow-up steps the agent could not complete."
      ]
    },
    "downloadPageUrl": "https://openagent3.xyz/downloads/azure-ai-evaluation-py",
    "agentPageUrl": "https://openagent3.xyz/skills/azure-ai-evaluation-py/agent",
    "manifestUrl": "https://openagent3.xyz/skills/azure-ai-evaluation-py/agent.json",
    "briefUrl": "https://openagent3.xyz/skills/azure-ai-evaluation-py/agent.md"
  },
  "agentAssist": {
    "summary": "Hand the extracted package to your coding agent with a concrete install brief instead of figuring it out manually.",
    "steps": [
      "Download the package from Yavira.",
      "Extract it into a folder your agent can access.",
      "Paste one of the prompts below and point your agent at the extracted folder."
    ],
    "prompts": [
      {
        "label": "New install",
        "body": "I downloaded a skill package from Yavira. Read SKILL.md from the extracted folder and install it by following the included instructions. Tell me what you changed and call out any manual steps you could not complete."
      },
      {
        "label": "Upgrade existing",
        "body": "I downloaded an updated skill package from Yavira. Read SKILL.md from the extracted folder, compare it with my current installation, and upgrade it while preserving any custom configuration unless the package docs explicitly say otherwise. Summarize what changed and any follow-up checks I should run."
      }
    ]
  },
  "documentation": {
    "source": "clawhub",
    "primaryDoc": "SKILL.md",
    "sections": [
      {
        "title": "Azure AI Evaluation SDK for Python",
        "body": "Assess generative AI application performance with built-in and custom evaluators."
      },
      {
        "title": "Installation",
        "body": "pip install azure-ai-evaluation\n\n# With remote evaluation support\npip install azure-ai-evaluation[remote]"
      },
      {
        "title": "Environment Variables",
        "body": "# For AI-assisted evaluators\nAZURE_OPENAI_ENDPOINT=https://<resource>.openai.azure.com\nAZURE_OPENAI_API_KEY=<your-api-key>\nAZURE_OPENAI_DEPLOYMENT=gpt-4o-mini\n\n# For Foundry project integration\nAIPROJECT_CONNECTION_STRING=<your-connection-string>"
      },
      {
        "title": "Quality Evaluators (AI-Assisted)",
        "body": "from azure.ai.evaluation import (\n    GroundednessEvaluator,\n    RelevanceEvaluator,\n    CoherenceEvaluator,\n    FluencyEvaluator,\n    SimilarityEvaluator,\n    RetrievalEvaluator\n)\n\n# Initialize with Azure OpenAI model config\nmodel_config = {\n    \"azure_endpoint\": os.environ[\"AZURE_OPENAI_ENDPOINT\"],\n    \"api_key\": os.environ[\"AZURE_OPENAI_API_KEY\"],\n    \"azure_deployment\": os.environ[\"AZURE_OPENAI_DEPLOYMENT\"]\n}\n\ngroundedness = GroundednessEvaluator(model_config)\nrelevance = RelevanceEvaluator(model_config)\ncoherence = CoherenceEvaluator(model_config)"
      },
      {
        "title": "Quality Evaluators (NLP-based)",
        "body": "from azure.ai.evaluation import (\n    F1ScoreEvaluator,\n    RougeScoreEvaluator,\n    BleuScoreEvaluator,\n    GleuScoreEvaluator,\n    MeteorScoreEvaluator\n)\n\nf1 = F1ScoreEvaluator()\nrouge = RougeScoreEvaluator()\nbleu = BleuScoreEvaluator()"
      },
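      {
        "title": "NLP Evaluator Usage",
        "body": "A minimal sketch of calling an NLP evaluator directly; these compare a response against a ground-truth answer, so no model config is needed. The example strings are illustrative only.\n\nresult = f1(\n    response=\"Azure AI provides AI services and tools.\",\n    ground_truth=\"Azure AI is Microsoft's platform of AI services.\"\n)\n\nprint(result[\"f1_score\"])  # 0-1, per the evaluator reference below"
      },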
      {
        "title": "Safety Evaluators",
        "body": "from azure.ai.evaluation import (\n    ViolenceEvaluator,\n    SexualEvaluator,\n    SelfHarmEvaluator,\n    HateUnfairnessEvaluator,\n    IndirectAttackEvaluator,\n    ProtectedMaterialEvaluator\n)\n\nviolence = ViolenceEvaluator(azure_ai_project=project_scope)\nsexual = SexualEvaluator(azure_ai_project=project_scope)"
      },
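      {
        "title": "Safety Evaluator Usage",
        "body": "A minimal single-row safety check, assuming the project_scope defined above; exact result keys can vary by SDK version, so this sketch prints the full dict.\n\nresult = violence(\n    query=\"What is Azure AI?\",\n    response=\"Azure AI provides AI services and tools.\"\n)\n\nprint(result)  # includes the violence metric on the 0-7 scale listed below"
      },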
      {
        "title": "Single Row Evaluation",
        "body": "from azure.ai.evaluation import GroundednessEvaluator\n\ngroundedness = GroundednessEvaluator(model_config)\n\nresult = groundedness(\n    query=\"What is Azure AI?\",\n    context=\"Azure AI is Microsoft's AI platform...\",\n    response=\"Azure AI provides AI services and tools.\"\n)\n\nprint(f\"Groundedness score: {result['groundedness']}\")\nprint(f\"Reason: {result['groundedness_reason']}\")"
      },
      {
        "title": "Batch Evaluation with evaluate()",
        "body": "from azure.ai.evaluation import evaluate\n\nresult = evaluate(\n    data=\"test_data.jsonl\",\n    evaluators={\n        \"groundedness\": groundedness,\n        \"relevance\": relevance,\n        \"coherence\": coherence\n    },\n    evaluator_config={\n        \"default\": {\n            \"column_mapping\": {\n                \"query\": \"${data.query}\",\n                \"context\": \"${data.context}\",\n                \"response\": \"${data.response}\"\n            }\n        }\n    }\n)\n\nprint(result[\"metrics\"])"
      },
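      {
        "title": "Sample Test Data (JSONL)",
        "body": "A sketch of the JSONL layout the batch example above expects: one JSON object per line, with field names matching the ${data.*} references in column_mapping. The rows here are illustrative.\n\n{\"query\": \"What is Azure AI?\", \"context\": \"Azure AI is Microsoft's AI platform...\", \"response\": \"Azure AI provides AI services and tools.\"}\n{\"query\": \"What does an evaluator return?\", \"context\": \"Evaluators return a dict of metric scores...\", \"response\": \"Each evaluator returns a dict of metrics.\"}"
      },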
      {
        "title": "Composite Evaluators",
        "body": "from azure.ai.evaluation import QAEvaluator, ContentSafetyEvaluator\n\n# All quality metrics in one\nqa_evaluator = QAEvaluator(model_config)\n\n# All safety metrics in one\nsafety_evaluator = ContentSafetyEvaluator(azure_ai_project=project_scope)\n\nresult = evaluate(\n    data=\"data.jsonl\",\n    evaluators={\n        \"qa\": qa_evaluator,\n        \"content_safety\": safety_evaluator\n    }\n)"
      },
      {
        "title": "Evaluate Application Target",
        "body": "from azure.ai.evaluation import evaluate\nfrom my_app import chat_app  # Your application\n\nresult = evaluate(\n    data=\"queries.jsonl\",\n    target=chat_app,  # Callable that takes query, returns response\n    evaluators={\n        \"groundedness\": groundedness\n    },\n    evaluator_config={\n        \"default\": {\n            \"column_mapping\": {\n                \"query\": \"${data.query}\",\n                \"context\": \"${outputs.context}\",\n                \"response\": \"${outputs.response}\"\n            }\n        }\n    }\n)"
      },
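      {
        "title": "Target Callable Shape",
        "body": "A sketch of what the chat_app target above is assumed to look like: evaluate() invokes it once per data row, and the keys of the returned dict become the ${outputs.*} columns in column_mapping. retrieve_context and generate_answer are hypothetical helpers.\n\ndef chat_app(query: str) -> dict:\n    context = retrieve_context(query)  # hypothetical retrieval step\n    response = generate_answer(query, context)  # hypothetical generation step\n    return {\"context\": context, \"response\": response}"
      },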
      {
        "title": "Code-Based",
        "body": "from azure.ai.evaluation import evaluator\n\n@evaluator\ndef word_count_evaluator(response: str) -> dict:\n    return {\"word_count\": len(response.split())}\n\n# Use in evaluate()\nresult = evaluate(\n    data=\"data.jsonl\",\n    evaluators={\"word_count\": word_count_evaluator}\n)"
      },
      {
        "title": "Prompt-Based",
        "body": "from azure.ai.evaluation import PromptChatTarget\n\nclass CustomEvaluator:\n    def __init__(self, model_config):\n        self.model = PromptChatTarget(model_config)\n    \n    def __call__(self, query: str, response: str) -> dict:\n        prompt = f\"Rate this response 1-5: Query: {query}, Response: {response}\"\n        result = self.model.send_prompt(prompt)\n        return {\"custom_score\": int(result)}"
      },
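      {
        "title": "Using a Prompt-Based Evaluator",
        "body": "A short usage sketch, assuming the CustomEvaluator class above: because instances are plain callables returning a dict, they plug into evaluate() like any built-in evaluator.\n\nfrom azure.ai.evaluation import evaluate\n\ncustom = CustomEvaluator(model_config)\n\nresult = evaluate(\n    data=\"data.jsonl\",\n    evaluators={\"custom\": custom}\n)\n\nprint(result[\"metrics\"])"
      },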
      {
        "title": "Log to Foundry Project",
        "body": "from azure.ai.projects import AIProjectClient\nfrom azure.identity import DefaultAzureCredential\n\nproject = AIProjectClient.from_connection_string(\n    conn_str=os.environ[\"AIPROJECT_CONNECTION_STRING\"],\n    credential=DefaultAzureCredential()\n)\n\nresult = evaluate(\n    data=\"data.jsonl\",\n    evaluators={\"groundedness\": groundedness},\n    azure_ai_project=project.scope  # Logs results to Foundry\n)\n\nprint(f\"View results: {result['studio_url']}\")"
      },
      {
        "title": "Evaluator Reference",
        "body": "EvaluatorTypeMetricsGroundednessEvaluatorAIgroundedness (1-5)RelevanceEvaluatorAIrelevance (1-5)CoherenceEvaluatorAIcoherence (1-5)FluencyEvaluatorAIfluency (1-5)SimilarityEvaluatorAIsimilarity (1-5)RetrievalEvaluatorAIretrieval (1-5)F1ScoreEvaluatorNLPf1_score (0-1)RougeScoreEvaluatorNLProuge scoresViolenceEvaluatorSafetyviolence (0-7)SexualEvaluatorSafetysexual (0-7)SelfHarmEvaluatorSafetyself_harm (0-7)HateUnfairnessEvaluatorSafetyhate_unfairness (0-7)QAEvaluatorCompositeAll quality metricsContentSafetyEvaluatorCompositeAll safety metrics"
      },
      {
        "title": "Best Practices",
        "body": "Use composite evaluators for comprehensive assessment\nMap columns correctly — mismatched columns cause silent failures\nLog to Foundry for tracking and comparison across runs\nCreate custom evaluators for domain-specific metrics\nUse NLP evaluators when you have ground truth answers\nSafety evaluators require Azure AI project scope\nBatch evaluation is more efficient than single-row loops"
      },
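      {
        "title": "Per-Evaluator Column Mapping",
        "body": "A minimal sketch of overriding the column mapping for one evaluator, assuming your data file carries a ground_truth column: entries in evaluator_config keyed by evaluator name take precedence over \"default\", which helps avoid the silent mapping failures noted above.\n\nfrom azure.ai.evaluation import evaluate, F1ScoreEvaluator\n\nresult = evaluate(\n    data=\"test_data.jsonl\",\n    evaluators={\"f1\": F1ScoreEvaluator()},\n    evaluator_config={\n        \"f1\": {\n            \"column_mapping\": {\n                \"response\": \"${data.response}\",\n                \"ground_truth\": \"${data.ground_truth}\"\n            }\n        }\n    }\n)"
      },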
      {
        "title": "Reference Files",
        "body": "FileContentsreferences/built-in-evaluators.mdDetailed patterns for AI-assisted, NLP-based, and Safety evaluators with configuration tablesreferences/custom-evaluators.mdCreating code-based and prompt-based custom evaluators, testing patternsscripts/run_batch_evaluation.pyCLI tool for running batch evaluations with quality, safety, and custom evaluators"
      }
    ],
    "body": "Azure AI Evaluation SDK for Python\n\nAssess generative AI application performance with built-in and custom evaluators.\n\nInstallation\npip install azure-ai-evaluation\n\n# With remote evaluation support\npip install azure-ai-evaluation[remote]\n\nEnvironment Variables\n# For AI-assisted evaluators\nAZURE_OPENAI_ENDPOINT=https://<resource>.openai.azure.com\nAZURE_OPENAI_API_KEY=<your-api-key>\nAZURE_OPENAI_DEPLOYMENT=gpt-4o-mini\n\n# For Foundry project integration\nAIPROJECT_CONNECTION_STRING=<your-connection-string>\n\nBuilt-in Evaluators\nQuality Evaluators (AI-Assisted)\nfrom azure.ai.evaluation import (\n    GroundednessEvaluator,\n    RelevanceEvaluator,\n    CoherenceEvaluator,\n    FluencyEvaluator,\n    SimilarityEvaluator,\n    RetrievalEvaluator\n)\n\n# Initialize with Azure OpenAI model config\nmodel_config = {\n    \"azure_endpoint\": os.environ[\"AZURE_OPENAI_ENDPOINT\"],\n    \"api_key\": os.environ[\"AZURE_OPENAI_API_KEY\"],\n    \"azure_deployment\": os.environ[\"AZURE_OPENAI_DEPLOYMENT\"]\n}\n\ngroundedness = GroundednessEvaluator(model_config)\nrelevance = RelevanceEvaluator(model_config)\ncoherence = CoherenceEvaluator(model_config)\n\nQuality Evaluators (NLP-based)\nfrom azure.ai.evaluation import (\n    F1ScoreEvaluator,\n    RougeScoreEvaluator,\n    BleuScoreEvaluator,\n    GleuScoreEvaluator,\n    MeteorScoreEvaluator\n)\n\nf1 = F1ScoreEvaluator()\nrouge = RougeScoreEvaluator()\nbleu = BleuScoreEvaluator()\n\nSafety Evaluators\nfrom azure.ai.evaluation import (\n    ViolenceEvaluator,\n    SexualEvaluator,\n    SelfHarmEvaluator,\n    HateUnfairnessEvaluator,\n    IndirectAttackEvaluator,\n    ProtectedMaterialEvaluator\n)\n\nviolence = ViolenceEvaluator(azure_ai_project=project_scope)\nsexual = SexualEvaluator(azure_ai_project=project_scope)\n\nSingle Row Evaluation\nfrom azure.ai.evaluation import GroundednessEvaluator\n\ngroundedness = GroundednessEvaluator(model_config)\n\nresult = groundedness(\n    query=\"What is Azure AI?\",\n    context=\"Azure AI is Microsoft's AI platform...\",\n    response=\"Azure AI provides AI services and tools.\"\n)\n\nprint(f\"Groundedness score: {result['groundedness']}\")\nprint(f\"Reason: {result['groundedness_reason']}\")\n\nBatch Evaluation with evaluate()\nfrom azure.ai.evaluation import evaluate\n\nresult = evaluate(\n    data=\"test_data.jsonl\",\n    evaluators={\n        \"groundedness\": groundedness,\n        \"relevance\": relevance,\n        \"coherence\": coherence\n    },\n    evaluator_config={\n        \"default\": {\n            \"column_mapping\": {\n                \"query\": \"${data.query}\",\n                \"context\": \"${data.context}\",\n                \"response\": \"${data.response}\"\n            }\n        }\n    }\n)\n\nprint(result[\"metrics\"])\n\nComposite Evaluators\nfrom azure.ai.evaluation import QAEvaluator, ContentSafetyEvaluator\n\n# All quality metrics in one\nqa_evaluator = QAEvaluator(model_config)\n\n# All safety metrics in one\nsafety_evaluator = ContentSafetyEvaluator(azure_ai_project=project_scope)\n\nresult = evaluate(\n    data=\"data.jsonl\",\n    evaluators={\n        \"qa\": qa_evaluator,\n        \"content_safety\": safety_evaluator\n    }\n)\n\nEvaluate Application Target\nfrom azure.ai.evaluation import evaluate\nfrom my_app import chat_app  # Your application\n\nresult = evaluate(\n    data=\"queries.jsonl\",\n    target=chat_app,  # Callable that takes query, returns response\n    evaluators={\n        \"groundedness\": groundedness\n    },\n    
evaluator_config={\n        \"default\": {\n            \"column_mapping\": {\n                \"query\": \"${data.query}\",\n                \"context\": \"${outputs.context}\",\n                \"response\": \"${outputs.response}\"\n            }\n        }\n    }\n)\n\nCustom Evaluators\nCode-Based\nfrom azure.ai.evaluation import evaluator\n\n@evaluator\ndef word_count_evaluator(response: str) -> dict:\n    return {\"word_count\": len(response.split())}\n\n# Use in evaluate()\nresult = evaluate(\n    data=\"data.jsonl\",\n    evaluators={\"word_count\": word_count_evaluator}\n)\n\nPrompt-Based\nfrom azure.ai.evaluation import PromptChatTarget\n\nclass CustomEvaluator:\n    def __init__(self, model_config):\n        self.model = PromptChatTarget(model_config)\n    \n    def __call__(self, query: str, response: str) -> dict:\n        prompt = f\"Rate this response 1-5: Query: {query}, Response: {response}\"\n        result = self.model.send_prompt(prompt)\n        return {\"custom_score\": int(result)}\n\nLog to Foundry Project\nfrom azure.ai.projects import AIProjectClient\nfrom azure.identity import DefaultAzureCredential\n\nproject = AIProjectClient.from_connection_string(\n    conn_str=os.environ[\"AIPROJECT_CONNECTION_STRING\"],\n    credential=DefaultAzureCredential()\n)\n\nresult = evaluate(\n    data=\"data.jsonl\",\n    evaluators={\"groundedness\": groundedness},\n    azure_ai_project=project.scope  # Logs results to Foundry\n)\n\nprint(f\"View results: {result['studio_url']}\")\n\nEvaluator Reference\nEvaluator\tType\tMetrics\nGroundednessEvaluator\tAI\tgroundedness (1-5)\nRelevanceEvaluator\tAI\trelevance (1-5)\nCoherenceEvaluator\tAI\tcoherence (1-5)\nFluencyEvaluator\tAI\tfluency (1-5)\nSimilarityEvaluator\tAI\tsimilarity (1-5)\nRetrievalEvaluator\tAI\tretrieval (1-5)\nF1ScoreEvaluator\tNLP\tf1_score (0-1)\nRougeScoreEvaluator\tNLP\trouge scores\nViolenceEvaluator\tSafety\tviolence (0-7)\nSexualEvaluator\tSafety\tsexual (0-7)\nSelfHarmEvaluator\tSafety\tself_harm (0-7)\nHateUnfairnessEvaluator\tSafety\thate_unfairness (0-7)\nQAEvaluator\tComposite\tAll quality metrics\nContentSafetyEvaluator\tComposite\tAll safety metrics\nBest Practices\nUse composite evaluators for comprehensive assessment\nMap columns correctly — mismatched columns cause silent failures\nLog to Foundry for tracking and comparison across runs\nCreate custom evaluators for domain-specific metrics\nUse NLP evaluators when you have ground truth answers\nSafety evaluators require Azure AI project scope\nBatch evaluation is more efficient than single-row loops\nReference Files\nFile\tContents\nreferences/built-in-evaluators.md\tDetailed patterns for AI-assisted, NLP-based, and Safety evaluators with configuration tables\nreferences/custom-evaluators.md\tCreating code-based and prompt-based custom evaluators, testing patterns\nscripts/run_batch_evaluation.py\tCLI tool for running batch evaluations with quality, safety, and custom evaluators"
  },
  "trust": {
    "sourceLabel": "tencent",
    "provenanceUrl": "https://clawhub.ai/thegovind/azure-ai-evaluation-py",
    "publisherUrl": "https://clawhub.ai/thegovind/azure-ai-evaluation-py",
    "owner": "thegovind",
    "version": "0.1.0",
    "license": null,
    "verificationStatus": "Indexed source record"
  },
  "links": {
    "detailUrl": "https://openagent3.xyz/skills/azure-ai-evaluation-py",
    "downloadUrl": "https://openagent3.xyz/downloads/azure-ai-evaluation-py",
    "agentUrl": "https://openagent3.xyz/skills/azure-ai-evaluation-py/agent",
    "manifestUrl": "https://openagent3.xyz/skills/azure-ai-evaluation-py/agent.json",
    "briefUrl": "https://openagent3.xyz/skills/azure-ai-evaluation-py/agent.md"
  }
}