{
  "schemaVersion": "1.0",
  "item": {
    "slug": "audio-speaker-tools",
    "name": "Audio Speaker Tools",
    "source": "tencent",
    "type": "skill",
    "category": "AI 智能",
    "sourceUrl": "https://clawhub.ai/cmfinlan/audio-speaker-tools",
    "canonicalUrl": "https://clawhub.ai/cmfinlan/audio-speaker-tools",
    "targetPlatform": "OpenClaw"
  },
  "install": {
    "downloadMode": "redirect",
    "downloadUrl": "/downloads/audio-speaker-tools",
    "sourceDownloadUrl": "https://wry-manatee-359.convex.site/api/v1/download?slug=audio-speaker-tools",
    "sourcePlatform": "tencent",
    "targetPlatform": "OpenClaw",
    "installMethod": "Manual import",
    "extraction": "Extract archive",
    "prerequisites": [
      "OpenClaw"
    ],
    "packageFormat": "ZIP package",
    "includedAssets": [
      "SKILL.md",
      "references/elevenlabs-cloning.md",
      "references/scoring-guide.md",
      "scripts/compare_voices.py",
      "scripts/diarize_and_slice_mps.py",
      "scripts/setup_venv.sh"
    ],
    "primaryDoc": "SKILL.md",
    "quickSetup": [
      "Download the package from Yavira.",
      "Extract the archive and review SKILL.md first.",
      "Import or place the package into your OpenClaw setup."
    ],
    "agentAssist": {
      "summary": "Hand the extracted package to your coding agent with a concrete install brief instead of figuring it out manually.",
      "steps": [
        "Download the package from Yavira.",
        "Extract it into a folder your agent can access.",
        "Paste one of the prompts below and point your agent at the extracted folder."
      ],
      "prompts": [
        {
          "label": "New install",
          "body": "I downloaded a skill package from Yavira. Read SKILL.md from the extracted folder and install it by following the included instructions. Tell me what you changed and call out any manual steps you could not complete."
        },
        {
          "label": "Upgrade existing",
          "body": "I downloaded an updated skill package from Yavira. Read SKILL.md from the extracted folder, compare it with my current installation, and upgrade it while preserving any custom configuration unless the package docs explicitly say otherwise. Summarize what changed and any follow-up checks I should run."
        }
      ]
    },
    "sourceHealth": {
      "source": "tencent",
      "status": "healthy",
      "reason": "direct_download_ok",
      "recommendedAction": "download",
      "checkedAt": "2026-04-23T16:43:11.935Z",
      "expiresAt": "2026-04-30T16:43:11.935Z",
      "httpStatus": 200,
      "finalUrl": "https://wry-manatee-359.convex.site/api/v1/download?slug=audio-speaker-tools",
      "contentType": "application/zip",
      "probeMethod": "head",
      "details": {
        "probeUrl": "https://wry-manatee-359.convex.site/api/v1/download?slug=audio-speaker-tools",
        "contentDisposition": "attachment; filename=\"audio-speaker-tools-1.0.0.zip\"",
        "redirectLocation": null,
        "bodySnippet": null
      },
      "scope": "source",
      "summary": "Source download looks usable.",
      "detail": "Yavira can redirect you to the upstream package for this source.",
      "primaryActionLabel": "Download for OpenClaw",
      "primaryActionHref": "/downloads/audio-speaker-tools"
    },
    "validation": {
      "installChecklist": [
        "Use the Yavira download entry.",
        "Review SKILL.md after the package is downloaded.",
        "Confirm the extracted package contains the expected setup assets."
      ],
      "postInstallChecks": [
        "Confirm the extracted package includes the expected docs or setup files.",
        "Validate the skill or prompts are available in your target agent workspace.",
        "Capture any manual follow-up steps the agent could not complete."
      ]
    },
    "downloadPageUrl": "https://openagent3.xyz/downloads/audio-speaker-tools",
    "agentPageUrl": "https://openagent3.xyz/skills/audio-speaker-tools/agent",
    "manifestUrl": "https://openagent3.xyz/skills/audio-speaker-tools/agent.json",
    "briefUrl": "https://openagent3.xyz/skills/audio-speaker-tools/agent.md"
  },
  "agentAssist": {
    "summary": "Hand the extracted package to your coding agent with a concrete install brief instead of figuring it out manually.",
    "steps": [
      "Download the package from Yavira.",
      "Extract it into a folder your agent can access.",
      "Paste one of the prompts below and point your agent at the extracted folder."
    ],
    "prompts": [
      {
        "label": "New install",
        "body": "I downloaded a skill package from Yavira. Read SKILL.md from the extracted folder and install it by following the included instructions. Tell me what you changed and call out any manual steps you could not complete."
      },
      {
        "label": "Upgrade existing",
        "body": "I downloaded an updated skill package from Yavira. Read SKILL.md from the extracted folder, compare it with my current installation, and upgrade it while preserving any custom configuration unless the package docs explicitly say otherwise. Summarize what changed and any follow-up checks I should run."
      }
    ]
  },
  "documentation": {
    "source": "clawhub",
    "primaryDoc": "SKILL.md",
    "sections": [
      {
        "title": "Audio Speaker Tools",
        "body": "Tools for speaker separation, voice comparison, and audio processing using Demucs, pyannote, and Resemblyzer."
      },
      {
        "title": "Overview",
        "body": "This skill provides three main workflows:\n\nSpeaker separation - Extract per-speaker audio from multi-speaker recordings\nVoice comparison - Measure speaker similarity between two audio files\nAudio processing - Segment extraction and voice isolation"
      },
      {
        "title": "Setup Virtual Environment",
        "body": "Run once to create the venv and install dependencies:\n\nbash scripts/setup_venv.sh\n\nDefault venv location: ./.venv\n\nRequirements:\n\nPython 3.9+\nffmpeg (brew install ffmpeg)\nHuggingFace token (set as env var HF_TOKEN)"
      },
      {
        "title": "1. Speaker Separation: diarize_and_slice_mps.py",
        "body": "Separate speakers from multi-speaker audio:\n\n# Basic usage\nHF_TOKEN=<your-hf-token> \\\n  /path/to/venv/bin/python scripts/diarize_and_slice_mps.py \\\n  --input audio.mp3 \\\n  --outdir /path/to/output \\\n  --prefix MyShow\n\n# With speaker constraints\nHF_TOKEN=$TOKEN python scripts/diarize_and_slice_mps.py \\\n  --input audio.mp3 \\\n  --outdir ./out \\\n  --min-speakers 2 \\\n  --max-speakers 5 \\\n  --pad-ms 100\n\nProcess:\n\nConverts input to 16kHz mono WAV\nRuns Demucs vocal/background separation (optional, for cleaner input)\nRuns pyannote speaker diarization (MPS-accelerated)\nExtracts concatenated per-speaker WAV files\n\nOutput:\n\n<prefix>_speaker1.wav, <prefix>_speaker2.wav, etc. (one per detected speaker)\ndiarization.rttm (time-stamped speaker segments)\nsegments.jsonl (JSON segments metadata)\nmeta.json (pipeline info and speaker index)\n\nImportant:\n\nAlways pass HF token via HF_TOKEN env var, never as CLI arg\nMPS first, CPU fallback - Script prefers Metal GPU, falls back to CPU if unavailable\nDefault output: ./separated/"
      },
      {
        "title": "2. Voice Comparison: compare_voices.py",
        "body": "Measure similarity between two voice samples using Resemblyzer:\n\n# Basic comparison\npython scripts/compare_voices.py \\\n  --audio1 sample1.wav \\\n  --audio2 sample2.wav\n\n# JSON output\npython scripts/compare_voices.py \\\n  --audio1 reference.wav \\\n  --audio2 clone.wav \\\n  --threshold 0.85 \\\n  --json\n\n# Exit code = 0 if pass, 1 if fail\n\nScores:\n\n< 0.75 = Different speakers\n0.75-0.84 = Likely same speaker\n0.85+ = Excellent match (ideal for voice cloning validation)\n\nUse cases:\n\nVoice clone quality assessment (compare clone vs. original)\nSpeaker verification (authenticate speaker identity)\nValidate speaker separation (confirm separated speakers are distinct)\n\nSee: references/scoring-guide.md for detailed interpretation"
      },
      {
        "title": "3. Audio Trimming",
        "body": "Use ffmpeg directly for segment extraction:\n\n# Extract 10-second segment starting at 5 seconds\nffmpeg -i input.mp3 -ss 5 -t 10 -c copy output.mp3\n\n# Extract vocals only with Demucs (before diarization)\ndemucs --two-stems vocals --out ./separated input.mp3"
      },
      {
        "title": "Workflow 1: Extract Clean Voice Sample for Cloning",
        "body": "Goal: Get a clean, single-speaker sample for ElevenLabs voice cloning\n\n# 1. Separate speakers\nHF_TOKEN=<your-hf-token> python scripts/diarize_and_slice_mps.py \\\n  --input podcast.mp3 --outdir ./out --prefix Podcast\n\n# 2. Review speaker files (out/Podcast_speaker1.wav, etc.)\n\n# 3. Select best sample (5-30s, clean speech)\nffmpeg -i out/Podcast_speaker2.wav -ss 10 -t 20 -c copy sample.wav\n\n# 4. Upload to ElevenLabs as instant voice clone\n\nSee: references/elevenlabs-cloning.md for best practices"
      },
      {
        "title": "Workflow 2: Validate Voice Clone Quality",
        "body": "Goal: Measure how well a cloned voice matches the original\n\n# 1. Generate test audio with ElevenLabs clone\n# (done via ElevenLabs web UI or API)\n\n# 2. Compare clone vs. reference\npython scripts/compare_voices.py \\\n  --audio1 original_sample.wav \\\n  --audio2 elevenlabs_clone.wav \\\n  --threshold 0.85 \\\n  --json\n\n# 3. Interpret score:\n#    0.85+ = excellent, publish-ready\n#    0.80-0.84 = acceptable, may need tweaking\n#    < 0.80 = poor, try different sample or settings\n\nSee: references/scoring-guide.md for troubleshooting low scores"
      },
      {
        "title": "Workflow 3: Multi-Speaker Conversation Analysis",
        "body": "Goal: Separate and identify speakers in a conversation\n\n# 1. Run diarization\nHF_TOKEN=$TOKEN python scripts/diarize_and_slice_mps.py \\\n  --input meeting.mp3 --outdir ./out --prefix Meeting\n\n# 2. Check detected speakers (meta.json)\ncat out/meta.json\n\n# 3. Compare speaker pairs to confirm separation\npython scripts/compare_voices.py \\\n  --audio1 out/Meeting_speaker1.wav \\\n  --audio2 out/Meeting_speaker2.wav\n\n# Expected: < 0.75 if separation worked correctly"
      },
      {
        "title": "Device Acceleration",
        "body": "pyannote diarization: MPS (Metal) by default, CPU fallback\nResemblyzer: CPU only (no GPU acceleration)\nDemucs: MPS by default when available\n\nTo force CPU for diarization: --device cpu"
      },
      {
        "title": "Audio Formats",
        "body": "Input: Any format supported by ffmpeg (wav, mp3, flac, m4a, etc.)\nProcessing: Internally converted to 16kHz mono WAV for diarization\nOutput: WAV format (44.1kHz stereo preserved from source)"
      },
      {
        "title": "HuggingFace Token",
        "body": "Required for: pyannote speaker diarization\nAccess: Must accept gated repo pyannote/speaker-diarization-3.1 on HF\nStorage: Any secure secrets manager\nUsage: Always pass via HF_TOKEN env var, never CLI arg"
      },
      {
        "title": "Sample Quality Tips",
        "body": "Shorter is better: 5-30s clean samples often score higher than 60+ second samples\nClean audio: Remove background noise with Demucs --two-stems vocals\nSingle speaker: Ensure isolated voice, not mixed conversation\nGood recording: Studio mic > phone mic for voice comparison accuracy"
      },
      {
        "title": "References",
        "body": "elevenlabs-cloning.md - Best practices for ElevenLabs instant voice cloning (model settings, sample selection, proven configurations)\nscoring-guide.md - How to interpret Resemblyzer similarity scores (thresholds, use cases, troubleshooting)"
      },
      {
        "title": "\"Missing HF token\" error",
        "body": "Export token before running: export HF_TOKEN=<your-token>\nOr pass inline: HF_TOKEN=<your-token> python script.py ..."
      },
      {
        "title": "Low voice comparison scores for same speaker",
        "body": "Try shorter, cleaner samples (5-30s)\nUse Demucs to isolate vocals: demucs --two-stems vocals input.mp3\nEnsure consistent recording quality (same mic, environment)\nSee references/scoring-guide.md troubleshooting section"
      },
      {
        "title": "Diarization not detecting all speakers",
        "body": "Adjust --min-speakers and --max-speakers flags\nCheck audio quality (clear speech, minimal overlap)\nTry longer audio (30+ seconds) for better speaker modeling"
      },
      {
        "title": "MPS/Metal acceleration not working",
        "body": "Ensure PyTorch with MPS support: python -c \"import torch; print(torch.backends.mps.is_available())\"\nFallback to CPU: --device cpu\nRe-run setup_venv.sh to reinstall PyTorch"
      }
    ],
    "body": "Audio Speaker Tools\n\nTools for speaker separation, voice comparison, and audio processing using Demucs, pyannote, and Resemblyzer.\n\nOverview\n\nThis skill provides three main workflows:\n\nSpeaker separation - Extract per-speaker audio from multi-speaker recordings\nVoice comparison - Measure speaker similarity between two audio files\nAudio processing - Segment extraction and voice isolation\nPrerequisites\nSetup Virtual Environment\n\nRun once to create the venv and install dependencies:\n\nbash scripts/setup_venv.sh\n\n\nDefault venv location: ./.venv\n\nRequirements:\n\nPython 3.9+\nffmpeg (brew install ffmpeg)\nHuggingFace token (set as env var HF_TOKEN)\nScripts\n1. Speaker Separation: diarize_and_slice_mps.py\n\nSeparate speakers from multi-speaker audio:\n\n# Basic usage\nHF_TOKEN=<your-hf-token> \\\n  /path/to/venv/bin/python scripts/diarize_and_slice_mps.py \\\n  --input audio.mp3 \\\n  --outdir /path/to/output \\\n  --prefix MyShow\n\n# With speaker constraints\nHF_TOKEN=$TOKEN python scripts/diarize_and_slice_mps.py \\\n  --input audio.mp3 \\\n  --outdir ./out \\\n  --min-speakers 2 \\\n  --max-speakers 5 \\\n  --pad-ms 100\n\n\nProcess:\n\nConverts input to 16kHz mono WAV\nRuns Demucs vocal/background separation (optional, for cleaner input)\nRuns pyannote speaker diarization (MPS-accelerated)\nExtracts concatenated per-speaker WAV files\n\nOutput:\n\n<prefix>_speaker1.wav, <prefix>_speaker2.wav, etc. (one per detected speaker)\ndiarization.rttm (time-stamped speaker segments)\nsegments.jsonl (JSON segments metadata)\nmeta.json (pipeline info and speaker index)\n\nImportant:\n\nAlways pass HF token via HF_TOKEN env var, never as CLI arg\nMPS first, CPU fallback - Script prefers Metal GPU, falls back to CPU if unavailable\nDefault output: ./separated/\n2. Voice Comparison: compare_voices.py\n\nMeasure similarity between two voice samples using Resemblyzer:\n\n# Basic comparison\npython scripts/compare_voices.py \\\n  --audio1 sample1.wav \\\n  --audio2 sample2.wav\n\n# JSON output\npython scripts/compare_voices.py \\\n  --audio1 reference.wav \\\n  --audio2 clone.wav \\\n  --threshold 0.85 \\\n  --json\n\n# Exit code = 0 if pass, 1 if fail\n\n\nScores:\n\n< 0.75 = Different speakers\n0.75-0.84 = Likely same speaker\n0.85+ = Excellent match (ideal for voice cloning validation)\n\nUse cases:\n\nVoice clone quality assessment (compare clone vs. original)\nSpeaker verification (authenticate speaker identity)\nValidate speaker separation (confirm separated speakers are distinct)\n\nSee: references/scoring-guide.md for detailed interpretation\n\n3. Audio Trimming\n\nUse ffmpeg directly for segment extraction:\n\n# Extract 10-second segment starting at 5 seconds\nffmpeg -i input.mp3 -ss 5 -t 10 -c copy output.mp3\n\n# Extract vocals only with Demucs (before diarization)\ndemucs --two-stems vocals --out ./separated input.mp3\n\nWorkflows\nWorkflow 1: Extract Clean Voice Sample for Cloning\n\nGoal: Get a clean, single-speaker sample for ElevenLabs voice cloning\n\n# 1. Separate speakers\nHF_TOKEN=<your-hf-token> python scripts/diarize_and_slice_mps.py \\\n  --input podcast.mp3 --outdir ./out --prefix Podcast\n\n# 2. Review speaker files (out/Podcast_speaker1.wav, etc.)\n\n# 3. Select best sample (5-30s, clean speech)\nffmpeg -i out/Podcast_speaker2.wav -ss 10 -t 20 -c copy sample.wav\n\n# 4. Upload to ElevenLabs as instant voice clone\n\n\nSee: references/elevenlabs-cloning.md for best practices\n\nWorkflow 2: Validate Voice Clone Quality\n\nGoal: Measure how well a cloned voice matches the original\n\n# 1. Generate test audio with ElevenLabs clone\n# (done via ElevenLabs web UI or API)\n\n# 2. Compare clone vs. reference\npython scripts/compare_voices.py \\\n  --audio1 original_sample.wav \\\n  --audio2 elevenlabs_clone.wav \\\n  --threshold 0.85 \\\n  --json\n\n# 3. Interpret score:\n#    0.85+ = excellent, publish-ready\n#    0.80-0.84 = acceptable, may need tweaking\n#    < 0.80 = poor, try different sample or settings\n\n\nSee: references/scoring-guide.md for troubleshooting low scores\n\nWorkflow 3: Multi-Speaker Conversation Analysis\n\nGoal: Separate and identify speakers in a conversation\n\n# 1. Run diarization\nHF_TOKEN=$TOKEN python scripts/diarize_and_slice_mps.py \\\n  --input meeting.mp3 --outdir ./out --prefix Meeting\n\n# 2. Check detected speakers (meta.json)\ncat out/meta.json\n\n# 3. Compare speaker pairs to confirm separation\npython scripts/compare_voices.py \\\n  --audio1 out/Meeting_speaker1.wav \\\n  --audio2 out/Meeting_speaker2.wav\n\n# Expected: < 0.75 if separation worked correctly\n\nTechnical Notes\nDevice Acceleration\npyannote diarization: MPS (Metal) by default, CPU fallback\nResemblyzer: CPU only (no GPU acceleration)\nDemucs: MPS by default when available\n\nTo force CPU for diarization: --device cpu\n\nAudio Formats\nInput: Any format supported by ffmpeg (wav, mp3, flac, m4a, etc.)\nProcessing: Internally converted to 16kHz mono WAV for diarization\nOutput: WAV format (44.1kHz stereo preserved from source)\nHuggingFace Token\nRequired for: pyannote speaker diarization\nAccess: Must accept gated repo pyannote/speaker-diarization-3.1 on HF\nStorage: Any secure secrets manager\nUsage: Always pass via HF_TOKEN env var, never CLI arg\nSample Quality Tips\nShorter is better: 5-30s clean samples often score higher than 60+ second samples\nClean audio: Remove background noise with Demucs --two-stems vocals\nSingle speaker: Ensure isolated voice, not mixed conversation\nGood recording: Studio mic > phone mic for voice comparison accuracy\nReferences\nelevenlabs-cloning.md - Best practices for ElevenLabs instant voice cloning (model settings, sample selection, proven configurations)\nscoring-guide.md - How to interpret Resemblyzer similarity scores (thresholds, use cases, troubleshooting)\nCommon Issues\n\"Missing HF token\" error\nExport token before running: export HF_TOKEN=<your-token>\nOr pass inline: HF_TOKEN=<your-token> python script.py ...\nLow voice comparison scores for same speaker\nTry shorter, cleaner samples (5-30s)\nUse Demucs to isolate vocals: demucs --two-stems vocals input.mp3\nEnsure consistent recording quality (same mic, environment)\nSee references/scoring-guide.md troubleshooting section\nDiarization not detecting all speakers\nAdjust --min-speakers and --max-speakers flags\nCheck audio quality (clear speech, minimal overlap)\nTry longer audio (30+ seconds) for better speaker modeling\nMPS/Metal acceleration not working\nEnsure PyTorch with MPS support: python -c \"import torch; print(torch.backends.mps.is_available())\"\nFallback to CPU: --device cpu\nRe-run setup_venv.sh to reinstall PyTorch"
  },
  "trust": {
    "sourceLabel": "tencent",
    "provenanceUrl": "https://clawhub.ai/cmfinlan/audio-speaker-tools",
    "publisherUrl": "https://clawhub.ai/cmfinlan/audio-speaker-tools",
    "owner": "cmfinlan",
    "version": "1.0.0",
    "license": null,
    "verificationStatus": "Indexed source record"
  },
  "links": {
    "detailUrl": "https://openagent3.xyz/skills/audio-speaker-tools",
    "downloadUrl": "https://openagent3.xyz/downloads/audio-speaker-tools",
    "agentUrl": "https://openagent3.xyz/skills/audio-speaker-tools/agent",
    "manifestUrl": "https://openagent3.xyz/skills/audio-speaker-tools/agent.json",
    "briefUrl": "https://openagent3.xyz/skills/audio-speaker-tools/agent.md"
  }
}