{
  "schemaVersion": "1.0",
  "item": {
    "slug": "audio-gen",
    "name": "Audio Content Generator",
    "source": "tencent",
    "type": "skill",
    "category": "内容创作",
    "sourceUrl": "https://clawhub.ai/udiedrichsen/audio-gen",
    "canonicalUrl": "https://clawhub.ai/udiedrichsen/audio-gen",
    "targetPlatform": "OpenClaw"
  },
  "install": {
    "downloadMode": "redirect",
    "downloadUrl": "/downloads/audio-gen",
    "sourceDownloadUrl": "https://wry-manatee-359.convex.site/api/v1/download?slug=audio-gen",
    "sourcePlatform": "tencent",
    "targetPlatform": "OpenClaw",
    "installMethod": "Manual import",
    "extraction": "Extract archive",
    "prerequisites": [
      "OpenClaw"
    ],
    "packageFormat": "ZIP package",
    "includedAssets": [
      "SKILL.md"
    ],
    "primaryDoc": "SKILL.md",
    "quickSetup": [
      "Download the package from Yavira.",
      "Extract the archive and review SKILL.md first.",
      "Import or place the package into your OpenClaw setup."
    ],
    "agentAssist": {
      "summary": "Hand the extracted package to your coding agent with a concrete install brief instead of figuring it out manually.",
      "steps": [
        "Download the package from Yavira.",
        "Extract it into a folder your agent can access.",
        "Paste one of the prompts below and point your agent at the extracted folder."
      ],
      "prompts": [
        {
          "label": "New install",
          "body": "I downloaded a skill package from Yavira. Read SKILL.md from the extracted folder and install it by following the included instructions. Tell me what you changed and call out any manual steps you could not complete."
        },
        {
          "label": "Upgrade existing",
          "body": "I downloaded an updated skill package from Yavira. Read SKILL.md from the extracted folder, compare it with my current installation, and upgrade it while preserving any custom configuration unless the package docs explicitly say otherwise. Summarize what changed and any follow-up checks I should run."
        }
      ]
    },
    "sourceHealth": {
      "source": "tencent",
      "status": "healthy",
      "reason": "direct_download_ok",
      "recommendedAction": "download",
      "checkedAt": "2026-04-23T16:43:11.935Z",
      "expiresAt": "2026-04-30T16:43:11.935Z",
      "httpStatus": 200,
      "finalUrl": "https://wry-manatee-359.convex.site/api/v1/download?slug=4claw-imageboard",
      "contentType": "application/zip",
      "probeMethod": "head",
      "details": {
        "probeUrl": "https://wry-manatee-359.convex.site/api/v1/download?slug=4claw-imageboard",
        "contentDisposition": "attachment; filename=\"4claw-imageboard-1.0.1.zip\"",
        "redirectLocation": null,
        "bodySnippet": null
      },
      "scope": "source",
      "summary": "Source download looks usable.",
      "detail": "Yavira can redirect you to the upstream package for this source.",
      "primaryActionLabel": "Download for OpenClaw",
      "primaryActionHref": "/downloads/audio-gen"
    },
    "validation": {
      "installChecklist": [
        "Use the Yavira download entry.",
        "Review SKILL.md after the package is downloaded.",
        "Confirm the extracted package contains the expected setup assets."
      ],
      "postInstallChecks": [
        "Confirm the extracted package includes the expected docs or setup files.",
        "Validate the skill or prompts are available in your target agent workspace.",
        "Capture any manual follow-up steps the agent could not complete."
      ]
    },
    "downloadPageUrl": "https://openagent3.xyz/downloads/audio-gen",
    "agentPageUrl": "https://openagent3.xyz/skills/audio-gen/agent",
    "manifestUrl": "https://openagent3.xyz/skills/audio-gen/agent.json",
    "briefUrl": "https://openagent3.xyz/skills/audio-gen/agent.md"
  },
  "agentAssist": {
    "summary": "Hand the extracted package to your coding agent with a concrete install brief instead of figuring it out manually.",
    "steps": [
      "Download the package from Yavira.",
      "Extract it into a folder your agent can access.",
      "Paste one of the prompts below and point your agent at the extracted folder."
    ],
    "prompts": [
      {
        "label": "New install",
        "body": "I downloaded a skill package from Yavira. Read SKILL.md from the extracted folder and install it by following the included instructions. Tell me what you changed and call out any manual steps you could not complete."
      },
      {
        "label": "Upgrade existing",
        "body": "I downloaded an updated skill package from Yavira. Read SKILL.md from the extracted folder, compare it with my current installation, and upgrade it while preserving any custom configuration unless the package docs explicitly say otherwise. Summarize what changed and any follow-up checks I should run."
      }
    ]
  },
  "documentation": {
    "source": "clawhub",
    "primaryDoc": "SKILL.md",
    "sections": [
      {
        "title": "🎙️ Audio Content Generator",
        "body": "Generate high-quality audiobooks, podcasts, or educational audio content on demand using AI-written scripts and ElevenLabs text-to-speech."
      },
      {
        "title": "Quick Start",
        "body": "Create an audiobook chapter:\n\nUser: \"Create a 5-minute audiobook chapter about a dragon discovering friendship\"\n\nGenerate a podcast:\n\nUser: \"Make a 10-minute podcast about the history of coffee\"\n\nProduce educational content:\n\nUser: \"Generate a 15-minute educational audio explaining how neural networks work\""
      },
      {
        "title": "Audiobook",
        "body": "Style: Narrative storytelling with emotional depth\n\nClear beginning, middle, and end\nDescriptive language and vivid imagery\nDramatic pacing with thoughtful pauses\nEmotional tone that matches the story\nUse voice effects like [whispers], [excited], [serious] for impact\n\nExample Structure:\n\n[Opening hook - set the scene]\n[long pause]\n\n[Story development with character emotions]\n[short pause] between sentences\n[long pause] between paragraphs\n\n[Climax with dramatic tension]\n[long pause]\n\n[Resolution and emotional closure]"
      },
      {
        "title": "Podcast",
        "body": "Style: Conversational and engaging\n\nWarm, welcoming intro (15-30 seconds)\nMain content with natural flow\nTransitions between topics\nMemorable outro with key takeaways\nConversational tone throughout\n\nExample Structure:\n\n**Intro:** \"Welcome to [topic]. I'm excited to share...\"\n[short pause]\n\n**Main Content:** \"Let's start with... [topic 1]\"\n[long pause] between segments\n\n**Outro:** \"Thanks for listening! Remember...\""
      },
      {
        "title": "Educational Content",
        "body": "Style: Clear explanations for learning\n\nSimple introductions to complex topics\nStep-by-step breakdowns\nReal-world examples and analogies\nRecap of key concepts at the end\nEnthusiastic delivery with [excited] for important points\n\nExample Structure:\n\n**Introduction:** What is [topic] and why it matters?\n\n**Main Content:**\n- Concept 1: Explanation + Example\n- Concept 2: Explanation + Example\n- Concept 3: Explanation + Example\n\n**Summary:** Key takeaways and next steps"
      },
      {
        "title": "Length Guidelines",
        "body": "Word Count to Duration Conversion:\n\n5 minutes = ~375 words\n10 minutes = ~750 words\n15 minutes = ~1,125 words\n20 minutes = ~1,500 words\n30 minutes = ~2,250 words\n\nPacing: Average conversational speed is ~75 words per minute\n\nPractical Limits:\n\nMinimum: 2 minutes (~150 words)\nMaximum: 30 minutes (~2,250 words)\nSweet spot: 5-15 minutes for best engagement"
      },
      {
        "title": "Step 1: Understand the Request",
        "body": "Parse the user's request for:\n\nContent type (audiobook, podcast, educational, or inferred from topic)\nTopic/theme (what should the content be about)\nTarget length (how many minutes)\nTone/style (dramatic, casual, educational, etc.)\nSpecial requests (specific voice, emphasis on certain points)"
      },
      {
        "title": "Step 2: Calculate Word Count",
        "body": "target_words = target_minutes × 75\n\nExample: 10 minutes = 10 × 75 = 750 words"
      },
      {
        "title": "Step 3: Generate the Script",
        "body": "Write the complete script following these rules:\n\nContent Guidelines:\n\nStart strong with an engaging hook\nMaintain natural, conversational flow\nUse active voice and simple sentence structure\nInclude relevant examples and stories\nEnd with a satisfying conclusion\n\nFormatting Rules:\n\nAdd [short pause] after sentences (use sparingly, not every sentence)\nAdd [long pause] between paragraphs or major sections\nUse voice effects strategically: [whispers], [shouts], [excited], [serious], [sarcastic], [sings], [laughs]\nWrite numbers as words: \"twenty-three\" not \"23\"\nSpell out acronyms first time: \"AI, or artificial intelligence\"\nAvoid complex punctuation (em-dashes work, but semicolons don't read well)\nRemove markdown formatting before TTS conversion"
      },
      {
        "title": "Step 4: Present the Script",
        "body": "Show the script to the user and ask:\n\nHere's the [format] script I've created (approximately [length] minutes):\n\n[Display the script]\n\nWould you like me to:\n1. Generate the audio now\n2. Make changes to the script\n3. Adjust the length or tone"
      },
      {
        "title": "Step 5: Handle User Feedback",
        "body": "If user requests changes:\n\nRegenerate the script with adjustments\nMaintain the target word count\nPresent the revised version\n\nIf user approves:\n\nProceed to audio generation"
      },
      {
        "title": "Step 6: Generate Audio",
        "body": "Format the script for TTS:\n\nRemove any remaining markdown (headers, bold, italics)\nEnsure voice effects are in proper [effect] format\nCheck that pauses are appropriately placed\nVerify numbers and acronyms are spelled out\n\nInvoke the TTS script:\n\nIMPORTANT: The ELEVENLABS_API_KEY environment variable is already configured in the system. Simply invoke the TTS script directly.\n\nuv run /home/clawdbot/clawdbot/skills/sag/scripts/tts.py \\\n  -o /tmp/audio-gen-[timestamp]-[topic-slug].mp3 \\\n  -m eleven_multilingual_v2 \\\n  \"[formatted_script]\"\n\nFor long scripts, use heredoc:\n\nuv run /home/clawdbot/clawdbot/skills/sag/scripts/tts.py \\\n  -o /tmp/audio-gen-[timestamp]-[topic-slug].mp3 \\\n  -m eleven_multilingual_v2 \\\n  \"$(cat <<'EOF'\n[formatted_script]\nEOF\n)\"\n\nReturn the result:\n\nMEDIA:/tmp/audio-gen-[timestamp]-[topic-slug].mp3\n\nYour [format] is ready! [Brief description of content]. Duration: approximately [X] minutes."
      },
      {
        "title": "Voice Effects (SSML Tags)",
        "body": "Available voice modulation effects (use sparingly for impact):\n\n[whispers] - Soft, intimate delivery\n[shouts] - Loud, emphatic delivery\n[excited] - Enthusiastic, energetic tone\n[serious] - Grave, solemn tone\n[sarcastic] - Ironic, mocking tone\n[sings] - Musical, melodic delivery\n[laughs] - Amused, jovial tone\n[short pause] - Brief silence (~0.5s)\n[long pause] - Extended silence (~1-2s)\n\nBest Practices:\n\nUse effects for emotional moments, not every sentence\nPauses are your most powerful tool for pacing\nVoice effects work best in audiobooks and dramatic content\nKeep podcasts and educational content mostly natural"
      },
      {
        "title": "Script Too Long",
        "body": "If the generated script exceeds target by >20%:\n\nThe script I generated is [X] words ([Y] minutes), which is longer than your target of [Z] minutes. Would you like me to:\n1. Condense it to fit the target length\n2. Split it into multiple parts\n3. Keep it as is"
      },
      {
        "title": "Script Too Short",
        "body": "If the generated script is under target by >20%:\n\nThe script is [X] words ([Y] minutes), shorter than your target. Would you like me to:\n1. Expand it with more detail\n2. Add additional examples or stories\n3. Generate as is"
      },
      {
        "title": "TTS Generation Fails",
        "body": "If the TTS script fails:\n\nI've created the script, but I'm unable to generate the audio right now. Here's your script:\n\n[Display script]\n\nError: [specific error message]\n\nYou can:\n1. Check that ELEVENLABS_API_KEY is configured\n2. Use the script with your own text-to-speech tool\n3. Try again in a moment\n4. Ask me to troubleshoot the audio generation\n\nCommon TTS Issues:\n\nAPI key not set: Verify ELEVENLABS_API_KEY in config\nRate limit: Wait a moment and try again\nText too long: Break into smaller chunks (max ~5000 characters)"
      },
      {
        "title": "Invalid Request",
        "body": "For unrealistic requests (e.g., \"100-hour audiobook\"):\n\nThat length would require [X] words and take significant time to generate. I recommend:\n- Breaking it into multiple episodes/chapters\n- Targeting 5-30 minutes per audio file\n- Creating a series instead of one long file"
      },
      {
        "title": "For Engaging Audiobooks",
        "body": "Focus on character emotions and sensory details\nUse pauses to build dramatic tension\nVary sentence length for rhythm\nInclude internal monologue and reflection"
      },
      {
        "title": "For Compelling Podcasts",
        "body": "Start with a question or surprising fact\nUse conversational phrases: \"You know what's interesting...\"\nInclude relatable examples from everyday life\nEnd with actionable takeaways"
      },
      {
        "title": "For Effective Educational Content",
        "body": "Use the \"explain like I'm five\" approach\nBuild from simple to complex concepts\nRepeat key terms and definitions\nProvide multiple examples for clarity"
      },
      {
        "title": "Technical Notes",
        "body": "TTS Implementation:\n\nUses Python script: ~/.clawdbot/clawdbot/skills/sag/scripts/tts.py\nNo binary installation required (pure Python + requests)\nDirectly calls ElevenLabs API\nCompatible with Linux and macOS\n\nFile Storage:\n\nAudio files are saved to /tmp/audio-gen/\nFilename format: audio-gen-[timestamp]-[topic-slug].mp3\nFiles are automatically cleaned up after 24 hours\n\nAPI Requirements:\n\nAnthropic API for script generation (already configured)\nElevenLabs API for text-to-speech (configured via ELEVENLABS_API_KEY)\nBoth services must be configured and have available credits\n\nSupported Models:\n\neleven_multilingual_v2 - Best quality (default)\neleven_turbo_v2 - Faster generation\neleven_turbo_v2_5 - Fastest generation\neleven_multilingual_v1 - Legacy model\n\nCost Estimate:\n\n10-minute audio (~750 words): approximately $1.43\n\nClaude API: ~$0.075\nElevenLabs: ~$1.35\n\n\nLonger content scales proportionally\n\nGeneration Time:\n\nScript generation: 5-30 seconds (depending on length)\nAudio generation: 5-15 seconds (ElevenLabs processing)\nTotal: Usually under 1 minute for 10-minute audio"
      },
      {
        "title": "Limitations",
        "body": "Maximum Length: 30 minutes (~2,250 words) per audio file\n\nFor longer content, create multiple parts/episodes\n\n\n\nSingle Voice: Currently supports one narrator voice\n\nCannot do multi-voice dialogue or character voices\n\n\n\nNo Background Music: Pure voice narration only\n\nNo background music, sound effects, or audio mixing\n\n\n\nReal-time Generation: Each request generates fresh content\n\nNo pre-made templates or cached audio\n\n\n\nLanguage: Primarily English\n\nElevenLabs supports other languages, but content generation optimized for English"
      },
      {
        "title": "Example 1: Quick Audiobook",
        "body": "User: Create a 5-minute audiobook chapter about a lonely robot learning to feel emotions\n\nAgent: I'll create a 5-minute sci-fi audiobook chapter. Let me write the story...\n\n[Generates ~400-word narrative with emotional arc]\n\nHere's the story I've created about Unit-7, a maintenance robot who discovers the meaning of friendship through an unexpected encounter. Would you like me to generate the audio or make changes first?\n\nUser: Generate the audio\n\nAgent: [Invokes SAG with formatted script]\n\nMEDIA:/tmp/audio-gen-1234567890-lonely-robot.mp3\n\nYour audiobook chapter is ready! The story follows Unit-7's emotional awakening through a touching encounter with a young engineer. Duration: approximately 5 minutes."
      },
      {
        "title": "Example 2: Podcast with Revisions",
        "body": "User: Make a 10-minute podcast about sustainable living tips\n\nAgent: [Generates podcast script with intro, 5 practical tips, outro]\n\nHere's a 10-minute podcast with 5 actionable sustainable living tips. The tone is encouraging and practical. Would you like me to:\n1. Generate audio now\n2. Make it more urgent/climate-focused\n3. Add more specific examples\n4. Adjust the length\n\nUser: Add more specific examples for each tip\n\nAgent: [Regenerates with detailed examples]\n\nBetter? I've added specific examples like calculating your carbon footprint, composting at home, and choosing sustainable brands. Ready for audio?\n\nUser: Perfect!\n\nAgent: MEDIA:/tmp/audio-gen-1234567890-sustainable-living.mp3\n\nYour sustainable living podcast is ready with 5 practical tips, each with concrete examples you can start using today! Duration: 10 minutes."
      },
      {
        "title": "Example 3: Educational Content",
        "body": "User: Create a 15-minute educational audio explaining quantum computing for beginners\n\nAgent: I'll create a beginner-friendly 15-minute explanation of quantum computing...\n\n[Generates ~1,100 words with analogies, examples, and clear structure]\n\nScript preview:\n- Introduction: What is quantum computing? (2 min)\n- Classical vs Quantum: The key differences (4 min)\n- Qubits and Superposition: How it works (5 min)\n- Real Applications: What it means for you (3 min)\n- Conclusion: The future of computing (1 min)\n\nShould I generate the audio?\n\nUser: Yes please\n\nAgent: MEDIA:/tmp/audio-gen-1234567890-quantum-computing.mp3\n\nYour educational podcast on quantum computing is ready! It uses everyday analogies to explain qubits, superposition, and real-world applications. Duration: approximately 15 minutes."
      },
      {
        "title": "When to Use This Skill",
        "body": "Use the audio-gen skill when users request:\n\n\"Create an audiobook about...\"\n\"Make a podcast episode about...\"\n\"Generate audio content explaining...\"\n\"Record a story about...\"\n\"Produce an audio guide for...\"\n\"Create a narrated version of...\"\n\"Make an audio lesson about...\"\n\nDo NOT use for:\n\nTranscribing existing audio (use whisper skills instead)\nConverting user-written text to audio (use SAG skill directly)\nReal-time voice interaction (use voice-call plugin)\nMulti-speaker dialogues or interviews (current limitation)"
      },
      {
        "title": "Voice Selection",
        "body": "Users can request specific voices:\n\nUser: Create a podcast with the Rachel voice\n\nAgent: [Checks voice configuration for \"Rachel\" voice ID and uses it in SAG command]"
      },
      {
        "title": "Series/Episodes",
        "body": "For multi-part content:\n\nUser: Create a 3-episode series about space exploration\n\nAgent: I'll create Episode 1 first. Each episode will be 10 minutes. Let's start with \"The Dawn of the Space Age\"...\n\n[After completion]\n\nEpisode 1 is ready! Would you like me to continue with Episode 2?"
      },
      {
        "title": "Format Blending",
        "body": "Mix formats for unique styles:\n\nUser: Create an educational podcast that tells a story\n\nAgent: [Generates content that combines storytelling narrative with educational explanations]"
      },
      {
        "title": "Troubleshooting",
        "body": "Issue: Audio sounds robotic or unnatural\nSolution: Add more pauses and voice effects. Use contractions and conversational language.\n\nIssue: Script doesn't match requested length\nSolution: Regenerate with explicit word count target. Check calculations (75 words/min).\n\nIssue: Content is too technical or too simple\nSolution: Ask user for target audience. Adjust complexity accordingly.\n\nIssue: SAG command fails\nSolution: Check ELEVENLABS_API_KEY is set. Verify SAG skill is installed and working.\n\nIssue: User wants to edit the script manually\nSolution: Provide the plain text script. User can modify it and paste back for audio generation.\n\n💡 Pro Tip: Always generate the script first and get user approval before creating audio. This saves time and API costs, and ensures the user gets exactly what they want."
      }
    ],
    "body": "🎙️ Audio Content Generator\n\nGenerate high-quality audiobooks, podcasts, or educational audio content on demand using AI-written scripts and ElevenLabs text-to-speech.\n\nQuick Start\n\nCreate an audiobook chapter:\n\nUser: \"Create a 5-minute audiobook chapter about a dragon discovering friendship\"\n\n\nGenerate a podcast:\n\nUser: \"Make a 10-minute podcast about the history of coffee\"\n\n\nProduce educational content:\n\nUser: \"Generate a 15-minute educational audio explaining how neural networks work\"\n\nContent Formats\nAudiobook\n\nStyle: Narrative storytelling with emotional depth\n\nClear beginning, middle, and end\nDescriptive language and vivid imagery\nDramatic pacing with thoughtful pauses\nEmotional tone that matches the story\nUse voice effects like [whispers], [excited], [serious] for impact\n\nExample Structure:\n\n[Opening hook - set the scene]\n[long pause]\n\n[Story development with character emotions]\n[short pause] between sentences\n[long pause] between paragraphs\n\n[Climax with dramatic tension]\n[long pause]\n\n[Resolution and emotional closure]\n\nPodcast\n\nStyle: Conversational and engaging\n\nWarm, welcoming intro (15-30 seconds)\nMain content with natural flow\nTransitions between topics\nMemorable outro with key takeaways\nConversational tone throughout\n\nExample Structure:\n\n**Intro:** \"Welcome to [topic]. I'm excited to share...\"\n[short pause]\n\n**Main Content:** \"Let's start with... [topic 1]\"\n[long pause] between segments\n\n**Outro:** \"Thanks for listening! Remember...\"\n\nEducational Content\n\nStyle: Clear explanations for learning\n\nSimple introductions to complex topics\nStep-by-step breakdowns\nReal-world examples and analogies\nRecap of key concepts at the end\nEnthusiastic delivery with [excited] for important points\n\nExample Structure:\n\n**Introduction:** What is [topic] and why it matters?\n\n**Main Content:**\n- Concept 1: Explanation + Example\n- Concept 2: Explanation + Example\n- Concept 3: Explanation + Example\n\n**Summary:** Key takeaways and next steps\n\nLength Guidelines\n\nWord Count to Duration Conversion:\n\n5 minutes = ~375 words\n10 minutes = ~750 words\n15 minutes = ~1,125 words\n20 minutes = ~1,500 words\n30 minutes = ~2,250 words\n\nPacing: Average conversational speed is ~75 words per minute\n\nPractical Limits:\n\nMinimum: 2 minutes (~150 words)\nMaximum: 30 minutes (~2,250 words)\nSweet spot: 5-15 minutes for best engagement\nWorkflow Instructions\nStep 1: Understand the Request\n\nParse the user's request for:\n\nContent type (audiobook, podcast, educational, or inferred from topic)\nTopic/theme (what should the content be about)\nTarget length (how many minutes)\nTone/style (dramatic, casual, educational, etc.)\nSpecial requests (specific voice, emphasis on certain points)\nStep 2: Calculate Word Count\ntarget_words = target_minutes × 75\n\n\nExample: 10 minutes = 10 × 75 = 750 words\n\nStep 3: Generate the Script\n\nWrite the complete script following these rules:\n\nContent Guidelines:\n\nStart strong with an engaging hook\nMaintain natural, conversational flow\nUse active voice and simple sentence structure\nInclude relevant examples and stories\nEnd with a satisfying conclusion\n\nFormatting Rules:\n\nAdd [short pause] after sentences (use sparingly, not every sentence)\nAdd [long pause] between paragraphs or major sections\nUse voice effects strategically: [whispers], [shouts], [excited], [serious], [sarcastic], [sings], [laughs]\nWrite numbers as words: \"twenty-three\" not \"23\"\nSpell out acronyms first time: \"AI, or artificial intelligence\"\nAvoid complex punctuation (em-dashes work, but semicolons don't read well)\nRemove markdown formatting before TTS conversion\nStep 4: Present the Script\n\nShow the script to the user and ask:\n\nHere's the [format] script I've created (approximately [length] minutes):\n\n[Display the script]\n\nWould you like me to:\n1. Generate the audio now\n2. Make changes to the script\n3. Adjust the length or tone\n\nStep 5: Handle User Feedback\n\nIf user requests changes:\n\nRegenerate the script with adjustments\nMaintain the target word count\nPresent the revised version\n\nIf user approves:\n\nProceed to audio generation\nStep 6: Generate Audio\n\nFormat the script for TTS:\n\nRemove any remaining markdown (headers, bold, italics)\nEnsure voice effects are in proper [effect] format\nCheck that pauses are appropriately placed\nVerify numbers and acronyms are spelled out\n\nInvoke the TTS script:\n\nIMPORTANT: The ELEVENLABS_API_KEY environment variable is already configured in the system. Simply invoke the TTS script directly.\n\nuv run /home/clawdbot/clawdbot/skills/sag/scripts/tts.py \\\n  -o /tmp/audio-gen-[timestamp]-[topic-slug].mp3 \\\n  -m eleven_multilingual_v2 \\\n  \"[formatted_script]\"\n\n\nFor long scripts, use heredoc:\n\nuv run /home/clawdbot/clawdbot/skills/sag/scripts/tts.py \\\n  -o /tmp/audio-gen-[timestamp]-[topic-slug].mp3 \\\n  -m eleven_multilingual_v2 \\\n  \"$(cat <<'EOF'\n[formatted_script]\nEOF\n)\"\n\n\nReturn the result:\n\nMEDIA:/tmp/audio-gen-[timestamp]-[topic-slug].mp3\n\nYour [format] is ready! [Brief description of content]. Duration: approximately [X] minutes.\n\nVoice Effects (SSML Tags)\n\nAvailable voice modulation effects (use sparingly for impact):\n\n[whispers] - Soft, intimate delivery\n[shouts] - Loud, emphatic delivery\n[excited] - Enthusiastic, energetic tone\n[serious] - Grave, solemn tone\n[sarcastic] - Ironic, mocking tone\n[sings] - Musical, melodic delivery\n[laughs] - Amused, jovial tone\n[short pause] - Brief silence (~0.5s)\n[long pause] - Extended silence (~1-2s)\n\nBest Practices:\n\nUse effects for emotional moments, not every sentence\nPauses are your most powerful tool for pacing\nVoice effects work best in audiobooks and dramatic content\nKeep podcasts and educational content mostly natural\nError Handling\nScript Too Long\n\nIf the generated script exceeds target by >20%:\n\nThe script I generated is [X] words ([Y] minutes), which is longer than your target of [Z] minutes. Would you like me to:\n1. Condense it to fit the target length\n2. Split it into multiple parts\n3. Keep it as is\n\nScript Too Short\n\nIf the generated script is under target by >20%:\n\nThe script is [X] words ([Y] minutes), shorter than your target. Would you like me to:\n1. Expand it with more detail\n2. Add additional examples or stories\n3. Generate as is\n\nTTS Generation Fails\n\nIf the TTS script fails:\n\nI've created the script, but I'm unable to generate the audio right now. Here's your script:\n\n[Display script]\n\nError: [specific error message]\n\nYou can:\n1. Check that ELEVENLABS_API_KEY is configured\n2. Use the script with your own text-to-speech tool\n3. Try again in a moment\n4. Ask me to troubleshoot the audio generation\n\n\nCommon TTS Issues:\n\nAPI key not set: Verify ELEVENLABS_API_KEY in config\nRate limit: Wait a moment and try again\nText too long: Break into smaller chunks (max ~5000 characters)\nInvalid Request\n\nFor unrealistic requests (e.g., \"100-hour audiobook\"):\n\nThat length would require [X] words and take significant time to generate. I recommend:\n- Breaking it into multiple episodes/chapters\n- Targeting 5-30 minutes per audio file\n- Creating a series instead of one long file\n\nTips for Best Results\nFor Engaging Audiobooks\nFocus on character emotions and sensory details\nUse pauses to build dramatic tension\nVary sentence length for rhythm\nInclude internal monologue and reflection\nFor Compelling Podcasts\nStart with a question or surprising fact\nUse conversational phrases: \"You know what's interesting...\"\nInclude relatable examples from everyday life\nEnd with actionable takeaways\nFor Effective Educational Content\nUse the \"explain like I'm five\" approach\nBuild from simple to complex concepts\nRepeat key terms and definitions\nProvide multiple examples for clarity\nTechnical Notes\n\nTTS Implementation:\n\nUses Python script: ~/.clawdbot/clawdbot/skills/sag/scripts/tts.py\nNo binary installation required (pure Python + requests)\nDirectly calls ElevenLabs API\nCompatible with Linux and macOS\n\nFile Storage:\n\nAudio files are saved to /tmp/audio-gen/\nFilename format: audio-gen-[timestamp]-[topic-slug].mp3\nFiles are automatically cleaned up after 24 hours\n\nAPI Requirements:\n\nAnthropic API for script generation (already configured)\nElevenLabs API for text-to-speech (configured via ELEVENLABS_API_KEY)\nBoth services must be configured and have available credits\n\nSupported Models:\n\neleven_multilingual_v2 - Best quality (default)\neleven_turbo_v2 - Faster generation\neleven_turbo_v2_5 - Fastest generation\neleven_multilingual_v1 - Legacy model\n\nCost Estimate:\n\n10-minute audio (~750 words): approximately $1.43\nClaude API: ~$0.075\nElevenLabs: ~$1.35\nLonger content scales proportionally\n\nGeneration Time:\n\nScript generation: 5-30 seconds (depending on length)\nAudio generation: 5-15 seconds (ElevenLabs processing)\nTotal: Usually under 1 minute for 10-minute audio\nLimitations\n\nMaximum Length: 30 minutes (~2,250 words) per audio file\n\nFor longer content, create multiple parts/episodes\n\nSingle Voice: Currently supports one narrator voice\n\nCannot do multi-voice dialogue or character voices\n\nNo Background Music: Pure voice narration only\n\nNo background music, sound effects, or audio mixing\n\nReal-time Generation: Each request generates fresh content\n\nNo pre-made templates or cached audio\n\nLanguage: Primarily English\n\nElevenLabs supports other languages, but content generation optimized for English\nExample Conversations\nExample 1: Quick Audiobook\nUser: Create a 5-minute audiobook chapter about a lonely robot learning to feel emotions\n\nAgent: I'll create a 5-minute sci-fi audiobook chapter. Let me write the story...\n\n[Generates ~400-word narrative with emotional arc]\n\nHere's the story I've created about Unit-7, a maintenance robot who discovers the meaning of friendship through an unexpected encounter. Would you like me to generate the audio or make changes first?\n\nUser: Generate the audio\n\nAgent: [Invokes SAG with formatted script]\n\nMEDIA:/tmp/audio-gen-1234567890-lonely-robot.mp3\n\nYour audiobook chapter is ready! The story follows Unit-7's emotional awakening through a touching encounter with a young engineer. Duration: approximately 5 minutes.\n\nExample 2: Podcast with Revisions\nUser: Make a 10-minute podcast about sustainable living tips\n\nAgent: [Generates podcast script with intro, 5 practical tips, outro]\n\nHere's a 10-minute podcast with 5 actionable sustainable living tips. The tone is encouraging and practical. Would you like me to:\n1. Generate audio now\n2. Make it more urgent/climate-focused\n3. Add more specific examples\n4. Adjust the length\n\nUser: Add more specific examples for each tip\n\nAgent: [Regenerates with detailed examples]\n\nBetter? I've added specific examples like calculating your carbon footprint, composting at home, and choosing sustainable brands. Ready for audio?\n\nUser: Perfect!\n\nAgent: MEDIA:/tmp/audio-gen-1234567890-sustainable-living.mp3\n\nYour sustainable living podcast is ready with 5 practical tips, each with concrete examples you can start using today! Duration: 10 minutes.\n\nExample 3: Educational Content\nUser: Create a 15-minute educational audio explaining quantum computing for beginners\n\nAgent: I'll create a beginner-friendly 15-minute explanation of quantum computing...\n\n[Generates ~1,100 words with analogies, examples, and clear structure]\n\nScript preview:\n- Introduction: What is quantum computing? (2 min)\n- Classical vs Quantum: The key differences (4 min)\n- Qubits and Superposition: How it works (5 min)\n- Real Applications: What it means for you (3 min)\n- Conclusion: The future of computing (1 min)\n\nShould I generate the audio?\n\nUser: Yes please\n\nAgent: MEDIA:/tmp/audio-gen-1234567890-quantum-computing.mp3\n\nYour educational podcast on quantum computing is ready! It uses everyday analogies to explain qubits, superposition, and real-world applications. Duration: approximately 15 minutes.\n\nWhen to Use This Skill\n\nUse the audio-gen skill when users request:\n\n\"Create an audiobook about...\"\n\"Make a podcast episode about...\"\n\"Generate audio content explaining...\"\n\"Record a story about...\"\n\"Produce an audio guide for...\"\n\"Create a narrated version of...\"\n\"Make an audio lesson about...\"\n\nDo NOT use for:\n\nTranscribing existing audio (use whisper skills instead)\nConverting user-written text to audio (use SAG skill directly)\nReal-time voice interaction (use voice-call plugin)\nMulti-speaker dialogues or interviews (current limitation)\nAdvanced Usage\nVoice Selection\n\nUsers can request specific voices:\n\nUser: Create a podcast with the Rachel voice\n\nAgent: [Checks voice configuration for \"Rachel\" voice ID and uses it in SAG command]\n\nSeries/Episodes\n\nFor multi-part content:\n\nUser: Create a 3-episode series about space exploration\n\nAgent: I'll create Episode 1 first. Each episode will be 10 minutes. Let's start with \"The Dawn of the Space Age\"...\n\n[After completion]\n\nEpisode 1 is ready! Would you like me to continue with Episode 2?\n\nFormat Blending\n\nMix formats for unique styles:\n\nUser: Create an educational podcast that tells a story\n\nAgent: [Generates content that combines storytelling narrative with educational explanations]\n\nTroubleshooting\n\nIssue: Audio sounds robotic or unnatural Solution: Add more pauses and voice effects. Use contractions and conversational language.\n\nIssue: Script doesn't match requested length Solution: Regenerate with explicit word count target. Check calculations (75 words/min).\n\nIssue: Content is too technical or too simple Solution: Ask user for target audience. Adjust complexity accordingly.\n\nIssue: SAG command fails Solution: Check ELEVENLABS_API_KEY is set. Verify SAG skill is installed and working.\n\nIssue: User wants to edit the script manually Solution: Provide the plain text script. User can modify it and paste back for audio generation.\n\n💡 Pro Tip: Always generate the script first and get user approval before creating audio. This saves time and API costs, and ensures the user gets exactly what they want."
  },
  "trust": {
    "sourceLabel": "tencent",
    "provenanceUrl": "https://clawhub.ai/udiedrichsen/audio-gen",
    "publisherUrl": "https://clawhub.ai/udiedrichsen/audio-gen",
    "owner": "udiedrichsen",
    "version": "1.0.0",
    "license": null,
    "verificationStatus": "Indexed source record"
  },
  "links": {
    "detailUrl": "https://openagent3.xyz/skills/audio-gen",
    "downloadUrl": "https://openagent3.xyz/downloads/audio-gen",
    "agentUrl": "https://openagent3.xyz/skills/audio-gen/agent",
    "manifestUrl": "https://openagent3.xyz/skills/audio-gen/agent.json",
    "briefUrl": "https://openagent3.xyz/skills/audio-gen/agent.md"
  }
}