{
  "schemaVersion": "1.0",
  "item": {
    "slug": "azure-ai-voicelive-py",
    "name": "Azure Ai Voicelive Py",
    "source": "tencent",
    "type": "skill",
    "category": "效率提升",
    "sourceUrl": "https://clawhub.ai/thegovind/azure-ai-voicelive-py",
    "canonicalUrl": "https://clawhub.ai/thegovind/azure-ai-voicelive-py",
    "targetPlatform": "OpenClaw"
  },
  "install": {
    "downloadMode": "redirect",
    "downloadUrl": "/downloads/azure-ai-voicelive-py",
    "sourceDownloadUrl": "https://wry-manatee-359.convex.site/api/v1/download?slug=azure-ai-voicelive-py",
    "sourcePlatform": "tencent",
    "targetPlatform": "OpenClaw",
    "installMethod": "Manual import",
    "extraction": "Extract archive",
    "prerequisites": [
      "OpenClaw"
    ],
    "packageFormat": "ZIP package",
    "includedAssets": [
      "SKILL.md",
      "references/api-reference.md",
      "references/examples.md",
      "references/models.md"
    ],
    "primaryDoc": "SKILL.md",
    "quickSetup": [
      "Download the package from Yavira.",
      "Extract the archive and review SKILL.md first.",
      "Import or place the package into your OpenClaw setup."
    ],
    "agentAssist": {
      "summary": "Hand the extracted package to your coding agent with a concrete install brief instead of figuring it out manually.",
      "steps": [
        "Download the package from Yavira.",
        "Extract it into a folder your agent can access.",
        "Paste one of the prompts below and point your agent at the extracted folder."
      ],
      "prompts": [
        {
          "label": "New install",
          "body": "I downloaded a skill package from Yavira. Read SKILL.md from the extracted folder and install it by following the included instructions. Tell me what you changed and call out any manual steps you could not complete."
        },
        {
          "label": "Upgrade existing",
          "body": "I downloaded an updated skill package from Yavira. Read SKILL.md from the extracted folder, compare it with my current installation, and upgrade it while preserving any custom configuration unless the package docs explicitly say otherwise. Summarize what changed and any follow-up checks I should run."
        }
      ]
    },
    "sourceHealth": {
      "source": "tencent",
      "status": "healthy",
      "reason": "direct_download_ok",
      "recommendedAction": "download",
      "checkedAt": "2026-04-23T16:43:11.935Z",
      "expiresAt": "2026-04-30T16:43:11.935Z",
      "httpStatus": 200,
      "finalUrl": "https://wry-manatee-359.convex.site/api/v1/download?slug=4claw-imageboard",
      "contentType": "application/zip",
      "probeMethod": "head",
      "details": {
        "probeUrl": "https://wry-manatee-359.convex.site/api/v1/download?slug=4claw-imageboard",
        "contentDisposition": "attachment; filename=\"4claw-imageboard-1.0.1.zip\"",
        "redirectLocation": null,
        "bodySnippet": null
      },
      "scope": "source",
      "summary": "Source download looks usable.",
      "detail": "Yavira can redirect you to the upstream package for this source.",
      "primaryActionLabel": "Download for OpenClaw",
      "primaryActionHref": "/downloads/azure-ai-voicelive-py"
    },
    "validation": {
      "installChecklist": [
        "Use the Yavira download entry.",
        "Review SKILL.md after the package is downloaded.",
        "Confirm the extracted package contains the expected setup assets."
      ],
      "postInstallChecks": [
        "Confirm the extracted package includes the expected docs or setup files.",
        "Validate the skill or prompts are available in your target agent workspace.",
        "Capture any manual follow-up steps the agent could not complete."
      ]
    },
    "downloadPageUrl": "https://openagent3.xyz/downloads/azure-ai-voicelive-py",
    "agentPageUrl": "https://openagent3.xyz/skills/azure-ai-voicelive-py/agent",
    "manifestUrl": "https://openagent3.xyz/skills/azure-ai-voicelive-py/agent.json",
    "briefUrl": "https://openagent3.xyz/skills/azure-ai-voicelive-py/agent.md"
  },
  "agentAssist": {
    "summary": "Hand the extracted package to your coding agent with a concrete install brief instead of figuring it out manually.",
    "steps": [
      "Download the package from Yavira.",
      "Extract it into a folder your agent can access.",
      "Paste one of the prompts below and point your agent at the extracted folder."
    ],
    "prompts": [
      {
        "label": "New install",
        "body": "I downloaded a skill package from Yavira. Read SKILL.md from the extracted folder and install it by following the included instructions. Tell me what you changed and call out any manual steps you could not complete."
      },
      {
        "label": "Upgrade existing",
        "body": "I downloaded an updated skill package from Yavira. Read SKILL.md from the extracted folder, compare it with my current installation, and upgrade it while preserving any custom configuration unless the package docs explicitly say otherwise. Summarize what changed and any follow-up checks I should run."
      }
    ]
  },
  "documentation": {
    "source": "clawhub",
    "primaryDoc": "SKILL.md",
    "sections": [
      {
        "title": "Azure AI Voice Live SDK",
        "body": "Build real-time voice AI applications with bidirectional WebSocket communication."
      },
      {
        "title": "Installation",
        "body": "pip install azure-ai-voicelive aiohttp azure-identity"
      },
      {
        "title": "Environment Variables",
        "body": "AZURE_COGNITIVE_SERVICES_ENDPOINT=https://<region>.api.cognitive.microsoft.com\n# For API key auth (not recommended for production)\nAZURE_COGNITIVE_SERVICES_KEY=<api-key>"
      },
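      {
        "title": "Loading Configuration (sketch)",
        "body": "A minimal sketch of selecting a credential from the environment variables above, combining the two auth paths shown in the Authentication section. The choose_credential helper is illustrative, not part of the SDK:\n\nimport os\nfrom azure.core.credentials import AzureKeyCredential\nfrom azure.identity.aio import DefaultAzureCredential\n\ndef choose_credential():\n    # Use an API key only if one is explicitly set; otherwise fall back to\n    # DefaultAzureCredential (preferred; key auth is not recommended for production).\n    key = os.environ.get(\"AZURE_COGNITIVE_SERVICES_KEY\")\n    if key:\n        return AzureKeyCredential(key)\n    return DefaultAzureCredential()\n\nendpoint = os.environ[\"AZURE_COGNITIVE_SERVICES_ENDPOINT\"]  # raises KeyError if unset\ncredential = choose_credential()"
      },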
      {
        "title": "Authentication",
        "body": "DefaultAzureCredential (preferred):\n\nfrom azure.ai.voicelive.aio import connect\nfrom azure.identity.aio import DefaultAzureCredential\n\nasync with connect(\n    endpoint=os.environ[\"AZURE_COGNITIVE_SERVICES_ENDPOINT\"],\n    credential=DefaultAzureCredential(),\n    model=\"gpt-4o-realtime-preview\",\n    credential_scopes=[\"https://cognitiveservices.azure.com/.default\"]\n) as conn:\n    ...\n\nAPI Key:\n\nfrom azure.ai.voicelive.aio import connect\nfrom azure.core.credentials import AzureKeyCredential\n\nasync with connect(\n    endpoint=os.environ[\"AZURE_COGNITIVE_SERVICES_ENDPOINT\"],\n    credential=AzureKeyCredential(os.environ[\"AZURE_COGNITIVE_SERVICES_KEY\"]),\n    model=\"gpt-4o-realtime-preview\"\n) as conn:\n    ..."
      },
      {
        "title": "Quick Start",
        "body": "import asyncio\nimport os\nfrom azure.ai.voicelive.aio import connect\nfrom azure.identity.aio import DefaultAzureCredential\n\nasync def main():\n    async with connect(\n        endpoint=os.environ[\"AZURE_COGNITIVE_SERVICES_ENDPOINT\"],\n        credential=DefaultAzureCredential(),\n        model=\"gpt-4o-realtime-preview\",\n        credential_scopes=[\"https://cognitiveservices.azure.com/.default\"]\n    ) as conn:\n        # Update session with instructions\n        await conn.session.update(session={\n            \"instructions\": \"You are a helpful assistant.\",\n            \"modalities\": [\"text\", \"audio\"],\n            \"voice\": \"alloy\"\n        })\n        \n        # Listen for events\n        async for event in conn:\n            print(f\"Event: {event.type}\")\n            if event.type == \"response.audio_transcript.done\":\n                print(f\"Transcript: {event.transcript}\")\n            elif event.type == \"response.done\":\n                break\n\nasyncio.run(main())"
      },
      {
        "title": "Connection Resources",
        "body": "The VoiceLiveConnection exposes these resources:\n\nResourcePurposeKey Methodsconn.sessionSession configurationupdate(session=...)conn.responseModel responsescreate(), cancel()conn.input_audio_bufferAudio inputappend(), commit(), clear()conn.output_audio_bufferAudio outputclear()conn.conversationConversation stateitem.create(), item.delete(), item.truncate()conn.transcription_sessionTranscription configupdate(session=...)"
      },
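      {
        "title": "Transcription Session Example (sketch)",
        "body": "A hedged sketch of configuring input transcription via conn.transcription_session from the table above. The session payload shape and the whisper-1 model name are assumptions; confirm the actual keys in references/api-reference.md:\n\n# Configure transcription of user audio (payload shape assumed)\nawait conn.transcription_session.update(session={\n    \"input_audio_transcription\": {\n        \"model\": \"whisper-1\"\n    }\n})\n\n# Transcripts then arrive as conversation.item.input_audio_transcription.*\n# events (see Event Handling below)."
      },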
      {
        "title": "Session Configuration",
        "body": "from azure.ai.voicelive.models import RequestSession, FunctionTool\n\nawait conn.session.update(session=RequestSession(\n    instructions=\"You are a helpful voice assistant.\",\n    modalities=[\"text\", \"audio\"],\n    voice=\"alloy\",  # or \"echo\", \"shimmer\", \"sage\", etc.\n    input_audio_format=\"pcm16\",\n    output_audio_format=\"pcm16\",\n    turn_detection={\n        \"type\": \"server_vad\",\n        \"threshold\": 0.5,\n        \"prefix_padding_ms\": 300,\n        \"silence_duration_ms\": 500\n    },\n    tools=[\n        FunctionTool(\n            type=\"function\",\n            name=\"get_weather\",\n            description=\"Get current weather\",\n            parameters={\n                \"type\": \"object\",\n                \"properties\": {\n                    \"location\": {\"type\": \"string\"}\n                },\n                \"required\": [\"location\"]\n            }\n        )\n    ]\n))"
      },
      {
        "title": "Send Audio (Base64 PCM16)",
        "body": "import base64\n\n# Read audio chunk (16-bit PCM, 24kHz mono)\naudio_chunk = await read_audio_from_microphone()\nb64_audio = base64.b64encode(audio_chunk).decode()\n\nawait conn.input_audio_buffer.append(audio=b64_audio)"
      },
      {
        "title": "Receive Audio",
        "body": "async for event in conn:\n    if event.type == \"response.audio.delta\":\n        audio_bytes = base64.b64decode(event.delta)\n        await play_audio(audio_bytes)\n    elif event.type == \"response.audio.done\":\n        print(\"Audio complete\")"
      },
      {
        "title": "Event Handling",
        "body": "async for event in conn:\n    match event.type:\n        # Session events\n        case \"session.created\":\n            print(f\"Session: {event.session}\")\n        case \"session.updated\":\n            print(\"Session updated\")\n        \n        # Audio input events\n        case \"input_audio_buffer.speech_started\":\n            print(f\"Speech started at {event.audio_start_ms}ms\")\n        case \"input_audio_buffer.speech_stopped\":\n            print(f\"Speech stopped at {event.audio_end_ms}ms\")\n        \n        # Transcription events\n        case \"conversation.item.input_audio_transcription.completed\":\n            print(f\"User said: {event.transcript}\")\n        case \"conversation.item.input_audio_transcription.delta\":\n            print(f\"Partial: {event.delta}\")\n        \n        # Response events\n        case \"response.created\":\n            print(f\"Response started: {event.response.id}\")\n        case \"response.audio_transcript.delta\":\n            print(event.delta, end=\"\", flush=True)\n        case \"response.audio.delta\":\n            audio = base64.b64decode(event.delta)\n        case \"response.done\":\n            print(f\"Response complete: {event.response.status}\")\n        \n        # Function calls\n        case \"response.function_call_arguments.done\":\n            result = handle_function(event.name, event.arguments)\n            await conn.conversation.item.create(item={\n                \"type\": \"function_call_output\",\n                \"call_id\": event.call_id,\n                \"output\": json.dumps(result)\n            })\n            await conn.response.create()\n        \n        # Errors\n        case \"error\":\n            print(f\"Error: {event.error.message}\")"
      },
      {
        "title": "Manual Turn Mode (No VAD)",
        "body": "await conn.session.update(session={\"turn_detection\": None})\n\n# Manually control turns\nawait conn.input_audio_buffer.append(audio=b64_audio)\nawait conn.input_audio_buffer.commit()  # End of user turn\nawait conn.response.create()  # Trigger response"
      },
      {
        "title": "Interrupt Handling",
        "body": "async for event in conn:\n    if event.type == \"input_audio_buffer.speech_started\":\n        # User interrupted - cancel current response\n        await conn.response.cancel()\n        await conn.output_audio_buffer.clear()"
      },
      {
        "title": "Conversation History",
        "body": "# Add system message\nawait conn.conversation.item.create(item={\n    \"type\": \"message\",\n    \"role\": \"system\",\n    \"content\": [{\"type\": \"input_text\", \"text\": \"Be concise.\"}]\n})\n\n# Add user message\nawait conn.conversation.item.create(item={\n    \"type\": \"message\",\n    \"role\": \"user\", \n    \"content\": [{\"type\": \"input_text\", \"text\": \"Hello!\"}]\n})\n\nawait conn.response.create()"
      },
      {
        "title": "Voice Options",
        "body": "VoiceDescriptionalloyNeutral, balancedechoWarm, conversationalshimmerClear, professionalsageCalm, authoritativecoralFriendly, upbeatashDeep, measuredballadExpressiveverseStorytelling\n\nAzure voices: Use AzureStandardVoice, AzureCustomVoice, or AzurePersonalVoice models."
      },
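      {
        "title": "Azure Voice Example (sketch)",
        "body": "A minimal sketch of switching from an OpenAI voice to an Azure neural voice. The AzureStandardVoice constructor argument and the en-US-AvaNeural voice name are assumptions; check references/models.md for the actual model fields:\n\nfrom azure.ai.voicelive.models import AzureStandardVoice\n\n# Select an Azure standard voice for the session (field name assumed)\nawait conn.session.update(session={\n    \"voice\": AzureStandardVoice(name=\"en-US-AvaNeural\")\n})"
      },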
      {
        "title": "Audio Formats",
        "body": "FormatSample RateUse Casepcm1624kHzDefault, high qualitypcm16-8000hz8kHzTelephonypcm16-16000hz16kHzVoice assistantsg711_ulaw8kHzTelephony (US)g711_alaw8kHzTelephony (EU)"
      },
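      {
        "title": "Telephony Format Example (sketch)",
        "body": "A short sketch applying the table above to a telephony scenario, using the same session.update call shown in Session Configuration:\n\n# US telephony: 8kHz G.711 u-law for both input and output\nawait conn.session.update(session={\n    \"input_audio_format\": \"g711_ulaw\",\n    \"output_audio_format\": \"g711_ulaw\"\n})"
      },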
      {
        "title": "Turn Detection Options",
        "body": "# Server VAD (default)\n{\"type\": \"server_vad\", \"threshold\": 0.5, \"silence_duration_ms\": 500}\n\n# Azure Semantic VAD (smarter detection)\n{\"type\": \"azure_semantic_vad\"}\n{\"type\": \"azure_semantic_vad_en\"}  # English optimized\n{\"type\": \"azure_semantic_vad_multilingual\"}"
      },
      {
        "title": "Error Handling",
        "body": "from azure.ai.voicelive.aio import ConnectionError, ConnectionClosed\n\ntry:\n    async with connect(...) as conn:\n        async for event in conn:\n            if event.type == \"error\":\n                print(f\"API Error: {event.error.code} - {event.error.message}\")\nexcept ConnectionClosed as e:\n    print(f\"Connection closed: {e.code} - {e.reason}\")\nexcept ConnectionError as e:\n    print(f\"Connection error: {e}\")"
      },
      {
        "title": "References",
        "body": "Detailed API Reference: See references/api-reference.md\nComplete Examples: See references/examples.md\nAll Models & Types: See references/models.md"
      }
    ],
    "body": "Azure AI Voice Live SDK\n\nBuild real-time voice AI applications with bidirectional WebSocket communication.\n\nInstallation\npip install azure-ai-voicelive aiohttp azure-identity\n\nEnvironment Variables\nAZURE_COGNITIVE_SERVICES_ENDPOINT=https://<region>.api.cognitive.microsoft.com\n# For API key auth (not recommended for production)\nAZURE_COGNITIVE_SERVICES_KEY=<api-key>\n\nAuthentication\n\nDefaultAzureCredential (preferred):\n\nfrom azure.ai.voicelive.aio import connect\nfrom azure.identity.aio import DefaultAzureCredential\n\nasync with connect(\n    endpoint=os.environ[\"AZURE_COGNITIVE_SERVICES_ENDPOINT\"],\n    credential=DefaultAzureCredential(),\n    model=\"gpt-4o-realtime-preview\",\n    credential_scopes=[\"https://cognitiveservices.azure.com/.default\"]\n) as conn:\n    ...\n\n\nAPI Key:\n\nfrom azure.ai.voicelive.aio import connect\nfrom azure.core.credentials import AzureKeyCredential\n\nasync with connect(\n    endpoint=os.environ[\"AZURE_COGNITIVE_SERVICES_ENDPOINT\"],\n    credential=AzureKeyCredential(os.environ[\"AZURE_COGNITIVE_SERVICES_KEY\"]),\n    model=\"gpt-4o-realtime-preview\"\n) as conn:\n    ...\n\nQuick Start\nimport asyncio\nimport os\nfrom azure.ai.voicelive.aio import connect\nfrom azure.identity.aio import DefaultAzureCredential\n\nasync def main():\n    async with connect(\n        endpoint=os.environ[\"AZURE_COGNITIVE_SERVICES_ENDPOINT\"],\n        credential=DefaultAzureCredential(),\n        model=\"gpt-4o-realtime-preview\",\n        credential_scopes=[\"https://cognitiveservices.azure.com/.default\"]\n    ) as conn:\n        # Update session with instructions\n        await conn.session.update(session={\n            \"instructions\": \"You are a helpful assistant.\",\n            \"modalities\": [\"text\", \"audio\"],\n            \"voice\": \"alloy\"\n        })\n        \n        # Listen for events\n        async for event in conn:\n            print(f\"Event: {event.type}\")\n            if event.type == \"response.audio_transcript.done\":\n                print(f\"Transcript: {event.transcript}\")\n            elif event.type == \"response.done\":\n                break\n\nasyncio.run(main())\n\nCore Architecture\nConnection Resources\n\nThe VoiceLiveConnection exposes these resources:\n\nResource\tPurpose\tKey Methods\nconn.session\tSession configuration\tupdate(session=...)\nconn.response\tModel responses\tcreate(), cancel()\nconn.input_audio_buffer\tAudio input\tappend(), commit(), clear()\nconn.output_audio_buffer\tAudio output\tclear()\nconn.conversation\tConversation state\titem.create(), item.delete(), item.truncate()\nconn.transcription_session\tTranscription config\tupdate(session=...)\nSession Configuration\nfrom azure.ai.voicelive.models import RequestSession, FunctionTool\n\nawait conn.session.update(session=RequestSession(\n    instructions=\"You are a helpful voice assistant.\",\n    modalities=[\"text\", \"audio\"],\n    voice=\"alloy\",  # or \"echo\", \"shimmer\", \"sage\", etc.\n    input_audio_format=\"pcm16\",\n    output_audio_format=\"pcm16\",\n    turn_detection={\n        \"type\": \"server_vad\",\n        \"threshold\": 0.5,\n        \"prefix_padding_ms\": 300,\n        \"silence_duration_ms\": 500\n    },\n    tools=[\n        FunctionTool(\n            type=\"function\",\n            name=\"get_weather\",\n            description=\"Get current weather\",\n            parameters={\n                \"type\": \"object\",\n                \"properties\": {\n                    \"location\": {\"type\": 
\"string\"}\n                },\n                \"required\": [\"location\"]\n            }\n        )\n    ]\n))\n\nAudio Streaming\nSend Audio (Base64 PCM16)\nimport base64\n\n# Read audio chunk (16-bit PCM, 24kHz mono)\naudio_chunk = await read_audio_from_microphone()\nb64_audio = base64.b64encode(audio_chunk).decode()\n\nawait conn.input_audio_buffer.append(audio=b64_audio)\n\nReceive Audio\nasync for event in conn:\n    if event.type == \"response.audio.delta\":\n        audio_bytes = base64.b64decode(event.delta)\n        await play_audio(audio_bytes)\n    elif event.type == \"response.audio.done\":\n        print(\"Audio complete\")\n\nEvent Handling\nasync for event in conn:\n    match event.type:\n        # Session events\n        case \"session.created\":\n            print(f\"Session: {event.session}\")\n        case \"session.updated\":\n            print(\"Session updated\")\n        \n        # Audio input events\n        case \"input_audio_buffer.speech_started\":\n            print(f\"Speech started at {event.audio_start_ms}ms\")\n        case \"input_audio_buffer.speech_stopped\":\n            print(f\"Speech stopped at {event.audio_end_ms}ms\")\n        \n        # Transcription events\n        case \"conversation.item.input_audio_transcription.completed\":\n            print(f\"User said: {event.transcript}\")\n        case \"conversation.item.input_audio_transcription.delta\":\n            print(f\"Partial: {event.delta}\")\n        \n        # Response events\n        case \"response.created\":\n            print(f\"Response started: {event.response.id}\")\n        case \"response.audio_transcript.delta\":\n            print(event.delta, end=\"\", flush=True)\n        case \"response.audio.delta\":\n            audio = base64.b64decode(event.delta)\n        case \"response.done\":\n            print(f\"Response complete: {event.response.status}\")\n        \n        # Function calls\n        case \"response.function_call_arguments.done\":\n            result = handle_function(event.name, event.arguments)\n            await conn.conversation.item.create(item={\n                \"type\": \"function_call_output\",\n                \"call_id\": event.call_id,\n                \"output\": json.dumps(result)\n            })\n            await conn.response.create()\n        \n        # Errors\n        case \"error\":\n            print(f\"Error: {event.error.message}\")\n\nCommon Patterns\nManual Turn Mode (No VAD)\nawait conn.session.update(session={\"turn_detection\": None})\n\n# Manually control turns\nawait conn.input_audio_buffer.append(audio=b64_audio)\nawait conn.input_audio_buffer.commit()  # End of user turn\nawait conn.response.create()  # Trigger response\n\nInterrupt Handling\nasync for event in conn:\n    if event.type == \"input_audio_buffer.speech_started\":\n        # User interrupted - cancel current response\n        await conn.response.cancel()\n        await conn.output_audio_buffer.clear()\n\nConversation History\n# Add system message\nawait conn.conversation.item.create(item={\n    \"type\": \"message\",\n    \"role\": \"system\",\n    \"content\": [{\"type\": \"input_text\", \"text\": \"Be concise.\"}]\n})\n\n# Add user message\nawait conn.conversation.item.create(item={\n    \"type\": \"message\",\n    \"role\": \"user\", \n    \"content\": [{\"type\": \"input_text\", \"text\": \"Hello!\"}]\n})\n\nawait conn.response.create()\n\nVoice Options\nVoice\tDescription\nalloy\tNeutral, balanced\necho\tWarm, conversational\nshimmer\tClear, 
professional\nsage\tCalm, authoritative\ncoral\tFriendly, upbeat\nash\tDeep, measured\nballad\tExpressive\nverse\tStorytelling\n\nAzure voices: Use AzureStandardVoice, AzureCustomVoice, or AzurePersonalVoice models.\n\nAudio Formats\nFormat\tSample Rate\tUse Case\npcm16\t24kHz\tDefault, high quality\npcm16-8000hz\t8kHz\tTelephony\npcm16-16000hz\t16kHz\tVoice assistants\ng711_ulaw\t8kHz\tTelephony (US)\ng711_alaw\t8kHz\tTelephony (EU)\nTurn Detection Options\n# Server VAD (default)\n{\"type\": \"server_vad\", \"threshold\": 0.5, \"silence_duration_ms\": 500}\n\n# Azure Semantic VAD (smarter detection)\n{\"type\": \"azure_semantic_vad\"}\n{\"type\": \"azure_semantic_vad_en\"}  # English optimized\n{\"type\": \"azure_semantic_vad_multilingual\"}\n\nError Handling\nfrom azure.ai.voicelive.aio import ConnectionError, ConnectionClosed\n\ntry:\n    async with connect(...) as conn:\n        async for event in conn:\n            if event.type == \"error\":\n                print(f\"API Error: {event.error.code} - {event.error.message}\")\nexcept ConnectionClosed as e:\n    print(f\"Connection closed: {e.code} - {e.reason}\")\nexcept ConnectionError as e:\n    print(f\"Connection error: {e}\")\n\nReferences\nDetailed API Reference: See references/api-reference.md\nComplete Examples: See references/examples.md\nAll Models & Types: See references/models.md"
  },
  "trust": {
    "sourceLabel": "tencent",
    "provenanceUrl": "https://clawhub.ai/thegovind/azure-ai-voicelive-py",
    "publisherUrl": "https://clawhub.ai/thegovind/azure-ai-voicelive-py",
    "owner": "thegovind",
    "version": "0.1.0",
    "license": null,
    "verificationStatus": "Indexed source record"
  },
  "links": {
    "detailUrl": "https://openagent3.xyz/skills/azure-ai-voicelive-py",
    "downloadUrl": "https://openagent3.xyz/downloads/azure-ai-voicelive-py",
    "agentUrl": "https://openagent3.xyz/skills/azure-ai-voicelive-py/agent",
    "manifestUrl": "https://openagent3.xyz/skills/azure-ai-voicelive-py/agent.json",
    "briefUrl": "https://openagent3.xyz/skills/azure-ai-voicelive-py/agent.md"
  }
}