{
  "schemaVersion": "1.0",
  "item": {
    "slug": "google-gemini-media",
    "name": "Google Gemini Media",
    "source": "tencent",
    "type": "skill",
    "category": "开发工具",
    "sourceUrl": "https://clawhub.ai/Xsir0/google-gemini-media",
    "canonicalUrl": "https://clawhub.ai/Xsir0/google-gemini-media",
    "targetPlatform": "OpenClaw"
  },
  "install": {
    "downloadMode": "redirect",
    "downloadUrl": "/downloads/google-gemini-media",
    "sourceDownloadUrl": "https://wry-manatee-359.convex.site/api/v1/download?slug=google-gemini-media",
    "sourcePlatform": "tencent",
    "targetPlatform": "OpenClaw",
    "installMethod": "Manual import",
    "extraction": "Extract archive",
    "prerequisites": [
      "OpenClaw"
    ],
    "packageFormat": "ZIP package",
    "includedAssets": [
      "SKILL.md"
    ],
    "primaryDoc": "SKILL.md",
    "quickSetup": [
      "Download the package from Yavira.",
      "Extract the archive and review SKILL.md first.",
      "Import or place the package into your OpenClaw setup."
    ],
    "agentAssist": {
      "summary": "Hand the extracted package to your coding agent with a concrete install brief instead of figuring it out manually.",
      "steps": [
        "Download the package from Yavira.",
        "Extract it into a folder your agent can access.",
        "Paste one of the prompts below and point your agent at the extracted folder."
      ],
      "prompts": [
        {
          "label": "New install",
          "body": "I downloaded a skill package from Yavira. Read SKILL.md from the extracted folder and install it by following the included instructions. Tell me what you changed and call out any manual steps you could not complete."
        },
        {
          "label": "Upgrade existing",
          "body": "I downloaded an updated skill package from Yavira. Read SKILL.md from the extracted folder, compare it with my current installation, and upgrade it while preserving any custom configuration unless the package docs explicitly say otherwise. Summarize what changed and any follow-up checks I should run."
        }
      ]
    },
    "sourceHealth": {
      "source": "tencent",
      "status": "healthy",
      "reason": "direct_download_ok",
      "recommendedAction": "download",
      "checkedAt": "2026-04-30T16:55:25.780Z",
      "expiresAt": "2026-05-07T16:55:25.780Z",
      "httpStatus": 200,
      "finalUrl": "https://wry-manatee-359.convex.site/api/v1/download?slug=google-gemini-media",
      "contentType": "application/zip",
      "probeMethod": "head",
      "details": {
        "probeUrl": "https://wry-manatee-359.convex.site/api/v1/download?slug=google-gemini-media",
        "contentDisposition": "attachment; filename=\"google-gemini-media-1.0.0.zip\"",
        "redirectLocation": null,
        "bodySnippet": null
      },
      "scope": "source",
      "summary": "Source download looks usable.",
      "detail": "Yavira can redirect you to the upstream package for this source.",
      "primaryActionLabel": "Download for OpenClaw",
      "primaryActionHref": "/downloads/google-gemini-media"
    },
    "validation": {
      "installChecklist": [
        "Use the Yavira download entry.",
        "Review SKILL.md after the package is downloaded.",
        "Confirm the extracted package contains the expected setup assets."
      ],
      "postInstallChecks": [
        "Confirm the extracted package includes the expected docs or setup files.",
        "Validate the skill or prompts are available in your target agent workspace.",
        "Capture any manual follow-up steps the agent could not complete."
      ]
    },
    "downloadPageUrl": "https://openagent3.xyz/downloads/google-gemini-media",
    "agentPageUrl": "https://openagent3.xyz/skills/google-gemini-media/agent",
    "manifestUrl": "https://openagent3.xyz/skills/google-gemini-media/agent.json",
    "briefUrl": "https://openagent3.xyz/skills/google-gemini-media/agent.md"
  },
  "agentAssist": {
    "summary": "Hand the extracted package to your coding agent with a concrete install brief instead of figuring it out manually.",
    "steps": [
      "Download the package from Yavira.",
      "Extract it into a folder your agent can access.",
      "Paste one of the prompts below and point your agent at the extracted folder."
    ],
    "prompts": [
      {
        "label": "New install",
        "body": "I downloaded a skill package from Yavira. Read SKILL.md from the extracted folder and install it by following the included instructions. Tell me what you changed and call out any manual steps you could not complete."
      },
      {
        "label": "Upgrade existing",
        "body": "I downloaded an updated skill package from Yavira. Read SKILL.md from the extracted folder, compare it with my current installation, and upgrade it while preserving any custom configuration unless the package docs explicitly say otherwise. Summarize what changed and any follow-up checks I should run."
      }
    ]
  },
  "documentation": {
    "source": "clawhub",
    "primaryDoc": "SKILL.md",
    "sections": [
      {
        "title": "1. Goals and scope",
        "body": "This Skill consolidates six Gemini API capabilities into reusable workflows and implementation templates:\n\nImage generation (Nano Banana: text-to-image, image editing, multi-turn iteration)\nImage understanding (caption/VQA/classification/comparison, multi-image prompts; supports inline and Files API)\nVideo generation (Veo 3.1: text-to-video, aspect ratio/resolution control, reference-image guidance, first/last frames, video extension, native audio)\nVideo understanding (upload/inline/YouTube URL; summaries, Q&A, timestamped evidence)\nSpeech generation (Gemini native TTS: single-speaker and multi-speaker; controllable style/accent/pace/tone)\nAudio understanding (upload/inline; description, transcription, time-range transcription, token counting)\n\nConvention: This Skill follows the official Google Gen AI SDK (Node.js/REST) as the main line; currently only Node.js/REST examples are provided. If your project already wraps other languages or frameworks, map this Skill's request structure, model selection, and I/O spec to your wrapper layer."
      },
      {
        "title": "2. Quick routing (decide which capability to use)",
        "body": "Do you need to produce images?\n\nNeed to generate images from scratch or edit based on an image -> use Nano Banana image generation (see Section 5)\n\nDo you need to understand images?\n\nNeed recognition, description, Q&A, comparison, or info extraction -> use Image understanding (see Section 6)\n\nDo you need to produce video?\n\nNeed to generate an 8-second video (optionally with native audio) -> use Veo 3.1 video generation (see Section 7)\n\nDo you need to understand video?\n\nNeed summaries/Q&A/segment extraction with timestamps -> use Video understanding (see Section 8)\n\nDo you need to read text aloud?\n\nNeed controllable narration, podcast/audiobook style, etc. -> use Speech generation (TTS) (see Section 9)\n\nDo you need to understand audio?\n\nNeed audio descriptions, transcription, time-range transcription, token counting -> use Audio understanding (see Section 10)"
      },
      {
        "title": "3.0 Prerequisites (dependencies and tools)",
        "body": "Node.js 18+ (match your project version)\nInstall SDK (example):\n\nnpm install @google/genai\n\nREST examples only need curl; if you need to parse image Base64, install jq (optional)."
      },
      {
        "title": "3.1 Authentication and environment variables",
        "body": "Put your API key in GEMINI_API_KEY\nREST requests use x-goog-api-key: $GEMINI_API_KEY"
      },
      {
        "title": "3.2 Two file input modes: Inline vs Files API",
        "body": "Inline (embedded bytes/Base64)\n\nPros: shorter call chain, good for small files.\nKey constraint: total request size (text prompt + system instructions + embedded bytes) typically has a ~20MB ceiling.\n\nFiles API (upload then reference)\n\nPros: good for large files, reusing the same file, or multi-turn conversations.\nTypical flow:\n\nfiles.upload(...) (SDK) or POST /upload/v1beta/files (REST resumable)\nUse file_data / file_uri in generateContent\n\nEngineering suggestion: implement ensure_file_uri() so that when a file exceeds a threshold (for example 10-15MB warning) or is reused, you automatically route through the Files API."
      },
      {
        "title": "3.3 Unified handling of binary media outputs",
        "body": "Images: usually returned as inline_data (Base64) in response parts; in the SDK use part.as_image() or decode Base64 and save as PNG/JPG.\nSpeech (TTS): usually returns PCM bytes (Base64); save as .pcm or wrap into .wav (commonly 24kHz, 16-bit, mono).\nVideo (Veo): long-running async task; poll the operation; download the file (or use the returned URI)."
      },
      {
        "title": "4. Model selection matrix (choose by scenario)",
        "body": "Important: model names, versions, limits, and quotas can change over time. Verify against official docs before use. Last updated: 2026-01-22."
      },
      {
        "title": "4.1 Image generation (Nano Banana)",
        "body": "gemini-2.5-flash-image: optimized for speed/throughput; good for frequent, low-latency generation/editing.\ngemini-3-pro-image-preview: stronger instruction following and high-fidelity text rendering; better for professional assets and complex edits."
      },
      {
        "title": "4.2 General image/video/audio understanding",
        "body": "Docs use gemini-3-flash-preview for image, video, and audio understanding (choose stronger models as needed for quality/cost)."
      },
      {
        "title": "4.3 Video generation (Veo)",
        "body": "Example model: veo-3.1-generate-preview (generates 8-second video and can natively generate audio)."
      },
      {
        "title": "4.4 Speech generation (TTS)",
        "body": "Example model: gemini-2.5-flash-preview-tts (native TTS, currently in preview)."
      },
      {
        "title": "5.1 Text-to-Image",
        "body": "SDK (Node.js) minimal template\n\nimport { GoogleGenAI } from \"@google/genai\";\nimport * as fs from \"node:fs\";\n\nconst ai = new GoogleGenAI({ apiKey: process.env.GEMINI_API_KEY });\n\nconst response = await ai.models.generateContent({\n  model: \"gemini-2.5-flash-image\",\n  contents:\n    \"Create a picture of a nano banana dish in a fancy restaurant with a Gemini theme\",\n});\n\nconst parts = response.candidates?.[0]?.content?.parts ?? [];\nfor (const part of parts) {\n  if (part.text) console.log(part.text);\n  if (part.inlineData?.data) {\n    fs.writeFileSync(\"out.png\", Buffer.from(part.inlineData.data, \"base64\"));\n  }\n}\n\nREST (with imageConfig) minimal template\n\ncurl -s -X POST   \"https://generativelanguage.googleapis.com/v1beta/models/gemini-2.5-flash-image:generateContent\"   -H \"x-goog-api-key: $GEMINI_API_KEY\"   -H \"Content-Type: application/json\"   -d '{\n    \"contents\":[{\"parts\":[{\"text\":\"Create a picture of a nano banana dish in a fancy restaurant with a Gemini theme\"}]}],\n    \"generationConfig\": {\"imageConfig\": {\"aspectRatio\":\"16:9\"}}\n  }'\n\nREST image parsing (Base64 decode)\n\ncurl -s -X POST \"https://generativelanguage.googleapis.com/v1beta/models/gemini-2.5-flash-image:generateContent\" \\\n  -H \"x-goog-api-key: $GEMINI_API_KEY\" \\\n  -H \"Content-Type: application/json\" \\\n  -d '{\"contents\":[{\"parts\":[{\"text\":\"A minimal studio product shot of a nano banana\"}]}]}' \\\n  | jq -r '.candidates[0].content.parts[] | select(.inline_data) | .inline_data.data' \\\n  | base64 --decode > out.png\n\n# macOS can use: base64 -D > out.png"
      },
      {
        "title": "5.2 Text-and-Image-to-Image",
        "body": "Use case: given an image, add/remove/modify elements, change style, color grading, etc.\n\nSDK (Node.js) minimal template\n\nimport { GoogleGenAI } from \"@google/genai\";\nimport * as fs from \"node:fs\";\n\nconst ai = new GoogleGenAI({ apiKey: process.env.GEMINI_API_KEY });\n\nconst prompt =\n  \"Add a nano banana on the table, keep lighting consistent, cinematic tone.\";\nconst imageBase64 = fs.readFileSync(\"input.png\").toString(\"base64\");\n\nconst response = await ai.models.generateContent({\n  model: \"gemini-2.5-flash-image\",\n  contents: [\n    { text: prompt },\n    { inlineData: { mimeType: \"image/png\", data: imageBase64 } },\n  ],\n});\n\nconst parts = response.candidates?.[0]?.content?.parts ?? [];\nfor (const part of parts) {\n  if (part.inlineData?.data) {\n    fs.writeFileSync(\"edited.png\", Buffer.from(part.inlineData.data, \"base64\"));\n  }\n}"
      },
      {
        "title": "5.3 Multi-turn image iteration (Multi-turn editing)",
        "body": "Best practice: use chat for continuous iteration (for example: generate first, then \"only edit a specific region/element\", then \"make variants in the same style\").\nTo output mixed \"text + image\" results, set response_modalities to [\"TEXT\", \"IMAGE\"]."
      },
      {
        "title": "5.4 ImageConfig",
        "body": "You can set in generationConfig.imageConfig or the SDK config:\n\naspectRatio: e.g. 16:9, 1:1.\nimageSize: e.g. 2K, 4K (higher resolution is usually slower/more expensive and model support can vary)."
      },
      {
        "title": "6.1 Two ways to provide input images",
        "body": "Inline image data: suitable for small files (total request size < 20MB).\nFiles API upload: better for large files or reuse across multiple requests."
      },
      {
        "title": "6.2 Inline images (Node.js) minimal template",
        "body": "import { GoogleGenAI } from \"@google/genai\";\nimport * as fs from \"node:fs\";\n\nconst ai = new GoogleGenAI({ apiKey: process.env.GEMINI_API_KEY });\n\nconst imageBase64 = fs.readFileSync(\"image.jpg\").toString(\"base64\");\n\nconst response = await ai.models.generateContent({\n  model: \"gemini-3-flash-preview\",\n  contents: [\n    { inlineData: { mimeType: \"image/jpeg\", data: imageBase64 } },\n    { text: \"Caption this image, and list any visible brands.\" },\n  ],\n});\n\nconsole.log(response.text);"
      },
      {
        "title": "6.3 Upload and reference with Files API (Node.js) minimal template",
        "body": "import { GoogleGenAI, createPartFromUri, createUserContent } from \"@google/genai\";\n\nconst ai = new GoogleGenAI({ apiKey: process.env.GEMINI_API_KEY });\nconst uploaded = await ai.files.upload({ file: \"image.jpg\" });\n\nconst response = await ai.models.generateContent({\n  model: \"gemini-3-flash-preview\",\n  contents: createUserContent([\n    createPartFromUri(uploaded.uri, uploaded.mimeType),\n    \"Caption this image.\",\n  ]),\n});\n\nconsole.log(response.text);"
      },
      {
        "title": "6.4 Multi-image prompts",
        "body": "Append multiple images as multiple Part entries in the same contents; you can mix uploaded references and inline bytes."
      },
      {
        "title": "7.1 Core features (must know)",
        "body": "Generates 8-second high-fidelity video, optionally 720p / 1080p / 4k, and supports native audio generation (dialogue, ambience, SFX).\nSupports:\n\nAspect ratio (16:9 / 9:16)\nVideo extension (extend a generated video; typically limited to 720p)\nFirst/last frame control (frame-specific)\nUp to 3 reference images (image-based direction)"
      },
      {
        "title": "7.2 SDK (Node.js) minimal template: async polling + download",
        "body": "import { GoogleGenAI } from \"@google/genai\";\n\nconst ai = new GoogleGenAI({ apiKey: process.env.GEMINI_API_KEY });\n\nconst prompt =\n  \"A cinematic shot of a cat astronaut walking on the moon. Include subtle wind ambience.\";\nlet operation = await ai.models.generateVideos({\n  model: \"veo-3.1-generate-preview\",\n  prompt,\n  config: { resolution: \"1080p\" },\n});\n\nwhile (!operation.done) {\n  await new Promise((resolve) => setTimeout(resolve, 10_000));\n  operation = await ai.operations.getVideosOperation({ operation });\n}\n\nconst video = operation.response?.generatedVideos?.[0]?.video;\nif (!video) throw new Error(\"No video returned\");\nawait ai.files.download({ file: video, downloadPath: \"out.mp4\" });"
      },
      {
        "title": "7.3 REST minimal template: predictLongRunning + poll + download",
        "body": "Key point: Veo REST uses :predictLongRunning to return an operation name, then poll GET /v1beta/{operation_name}; once done, download from the video URI in the response."
      },
      {
        "title": "7.4 Common controls (recommend a unified wrapper)",
        "body": "aspectRatio: \"16:9\" or \"9:16\"\nresolution: \"720p\" | \"1080p\" | \"4k\" (higher resolutions are usually slower/more expensive)\nWhen writing prompts: put dialogue in quotes; explicitly call out SFX and ambience; use cinematography language (camera position, movement, composition, lens effects, mood).\nNegative constraints: if the API supports a negative prompt field, use it; otherwise list elements you do not want to see."
      },
      {
        "title": "7.5 Important limits (engineering fallback needed)",
        "body": "Latency can vary from seconds to minutes; implement timeouts and retries.\nGenerated videos are only retained on the server for a limited time (download promptly).\nOutputs include a SynthID watermark.\n\nPolling fallback (with timeout/backoff) pseudocode\n\nconst deadline = Date.now() + 300_000; // 5 min\nlet sleepMs = 2000;\nwhile (!operation.done && Date.now() < deadline) {\n  await new Promise((resolve) => setTimeout(resolve, sleepMs));\n  sleepMs = Math.min(Math.floor(sleepMs * 1.5), 15_000);\n  operation = await ai.operations.getVideosOperation({ operation });\n}\nif (!operation.done) throw new Error(\"video generation timed out\");"
      },
      {
        "title": "8.1 Video input options",
        "body": "Files API upload: recommended when file > 100MB, video length > ~1 minute, or you need reuse.\nInline video data: for smaller files.\nDirect YouTube URL: can analyze public videos."
      },
      {
        "title": "8.2 Files API (Node.js) minimal template",
        "body": "import { GoogleGenAI, createPartFromUri, createUserContent } from \"@google/genai\";\n\nconst ai = new GoogleGenAI({ apiKey: process.env.GEMINI_API_KEY });\nconst uploaded = await ai.files.upload({ file: \"sample.mp4\" });\n\nconst response = await ai.models.generateContent({\n  model: \"gemini-3-flash-preview\",\n  contents: createUserContent([\n    createPartFromUri(uploaded.uri, uploaded.mimeType),\n    \"Summarize this video. Provide timestamps for key events.\",\n  ]),\n});\n\nconsole.log(response.text);"
      },
      {
        "title": "8.3 Timestamp prompting strategy",
        "body": "Ask for segmented bullets with \"(mm:ss)\" timestamps.\nRequire \"evidence with specific time ranges\" and include downstream structured extraction (JSON) in the same prompt if needed."
      },
      {
        "title": "9.1 Positioning",
        "body": "Native TTS: for \"precise reading + controllable style\" (podcasts, audiobooks, ad voiceover, etc.).\nDistinguish from the Live API: Live API is more interactive and non-structured audio/multimodal conversation; TTS is focused on controlled narration."
      },
      {
        "title": "9.2 Single-speaker TTS (Node.js) minimal template",
        "body": "import { GoogleGenAI } from \"@google/genai\";\nimport * as fs from \"node:fs\";\n\nconst ai = new GoogleGenAI({ apiKey: process.env.GEMINI_API_KEY });\n\nconst response = await ai.models.generateContent({\n  model: \"gemini-2.5-flash-preview-tts\",\n  contents: [{ parts: [{ text: \"Say cheerfully: Have a wonderful day!\" }] }],\n  config: {\n    responseModalities: [\"AUDIO\"],\n    speechConfig: {\n      voiceConfig: {\n        prebuiltVoiceConfig: { voiceName: \"Kore\" },\n      },\n    },\n  },\n});\n\nconst data =\n  response.candidates?.[0]?.content?.parts?.[0]?.inlineData?.data ?? \"\";\nif (!data) throw new Error(\"No audio returned\");\nfs.writeFileSync(\"out.pcm\", Buffer.from(data, \"base64\"));"
      },
      {
        "title": "9.3 Multi-speaker TTS (max 2 speakers)",
        "body": "Requirements:\n\nUse multiSpeakerVoiceConfig\nEach speaker name must match the dialogue labels in the prompt (e.g., Joe/Jane)."
      },
      {
        "title": "9.4 Voice options and language",
        "body": "voice_name supports 30 prebuilt voices (for example Zephyr, Puck, Charon, Kore, etc.).\nThe model can auto-detect input language and supports 24 languages (see docs for the list)."
      },
      {
        "title": "9.5 \"Director notes\" (strongly recommended for high-quality voice)",
        "body": "Provide controllable directions for style, pace, accent, etc., but avoid over-constraining."
      },
      {
        "title": "10.1 Typical tasks",
        "body": "Describe audio content (including non-speech like birds, alarms, etc.)\nGenerate transcripts\nTranscribe specific time ranges\nCount tokens (for cost estimates/segmentation)"
      },
      {
        "title": "10.2 Files API (Node.js) minimal template",
        "body": "import { GoogleGenAI, createPartFromUri, createUserContent } from \"@google/genai\";\n\nconst ai = new GoogleGenAI({ apiKey: process.env.GEMINI_API_KEY });\nconst uploaded = await ai.files.upload({ file: \"sample.mp3\" });\n\nconst response = await ai.models.generateContent({\n  model: \"gemini-3-flash-preview\",\n  contents: createUserContent([\n    \"Describe this audio clip.\",\n    createPartFromUri(uploaded.uri, uploaded.mimeType),\n  ]),\n});\n\nconsole.log(response.text);"
      },
      {
        "title": "10.3 Key limits and engineering tips",
        "body": "Supports common formats: WAV/MP3/AIFF/AAC/OGG/FLAC.\nAudio tokenization: about 32 tokens/second (about 1920 tokens per minute; values may change).\nTotal audio length per prompt is capped at 9.5 hours; multi-channel audio is downmixed; audio is resampled (see docs for exact parameters).\nIf total request size exceeds 20MB, you must use the Files API."
      },
      {
        "title": "Example A: Image generation -> validation via understanding",
        "body": "Generate product images with Nano Banana (require negative space, consistent lighting).\nUse image understanding for self-check: verify text clarity, brand spelling, and unsafe elements.\nIf not satisfied, feed the generated image into text+image editing and iterate."
      },
      {
        "title": "Example B: Video generation -> video understanding -> narration script",
        "body": "Generate an 8-second shot with Veo (include dialogue or SFX).\nDownload and save (respect retention window).\nUpload video to video understanding to produce a storyboard + timestamps + narration copy (then feed to TTS)."
      },
      {
        "title": "Example C: Audio understanding -> time-range transcription -> TTS redub",
        "body": "Upload meeting audio and transcribe full content.\nTranscribe or summarize specific time ranges.\nUse TTS to generate a \"broadcast\" version of the summary."
      },
      {
        "title": "12. Compliance and risk (must follow)",
        "body": "Ensure you have the necessary rights to upload images/video/audio; do not generate infringing, deceptive, harassing, or harmful content.\nGenerated images and videos include SynthID watermarking; videos may also have regional/person-based generation constraints.\nProduction systems must implement timeouts, retries, failure fallbacks, and human review/post-processing for generated content."
      },
      {
        "title": "13. Quick reference (Checklist)",
        "body": "Pick the right model: image generation (Flash Image / Pro Image Preview), video generation (Veo 3.1), TTS (Gemini 2.5 TTS), understanding (Gemini Flash/Pro).\n Pick the right input mode: inline for small files; Files API for large/reuse.\n Parse binary outputs correctly: image/audio via inline_data decode; video via operation polling + download.\n For video generation: set aspectRatio / resolution, and download promptly (avoid expiration).\n For TTS: set response_modalities=[\"AUDIO\"]; max 2 speakers; speaker names must match prompt.\n For audio understanding: countTokens when needed; segment long audio or use Files API."
      }
    ],
    "body": "Gemini Multimodal Media (Image/Video/Speech) Skill\n1. Goals and scope\n\nThis Skill consolidates six Gemini API capabilities into reusable workflows and implementation templates:\n\nImage generation (Nano Banana: text-to-image, image editing, multi-turn iteration)\nImage understanding (caption/VQA/classification/comparison, multi-image prompts; supports inline and Files API)\nVideo generation (Veo 3.1: text-to-video, aspect ratio/resolution control, reference-image guidance, first/last frames, video extension, native audio)\nVideo understanding (upload/inline/YouTube URL; summaries, Q&A, timestamped evidence)\nSpeech generation (Gemini native TTS: single-speaker and multi-speaker; controllable style/accent/pace/tone)\nAudio understanding (upload/inline; description, transcription, time-range transcription, token counting)\n\nConvention: This Skill follows the official Google Gen AI SDK (Node.js/REST) as the main line; currently only Node.js/REST examples are provided. If your project already wraps other languages or frameworks, map this Skill's request structure, model selection, and I/O spec to your wrapper layer.\n\n2. Quick routing (decide which capability to use)\nDo you need to produce images?\nNeed to generate images from scratch or edit based on an image -> use Nano Banana image generation (see Section 5)\nDo you need to understand images?\nNeed recognition, description, Q&A, comparison, or info extraction -> use Image understanding (see Section 6)\nDo you need to produce video?\nNeed to generate an 8-second video (optionally with native audio) -> use Veo 3.1 video generation (see Section 7)\nDo you need to understand video?\nNeed summaries/Q&A/segment extraction with timestamps -> use Video understanding (see Section 8)\nDo you need to read text aloud?\nNeed controllable narration, podcast/audiobook style, etc. 
-> use Speech generation (TTS) (see Section 9)\nDo you need to understand audio?\nNeed audio descriptions, transcription, time-range transcription, token counting -> use Audio understanding (see Section 10)\n3. Unified engineering constraints and I/O spec (must read)\n3.0 Prerequisites (dependencies and tools)\nNode.js 18+ (match your project version)\nInstall SDK (example):\nnpm install @google/genai\n\nREST examples only need curl; if you need to parse image Base64, install jq (optional).\n3.1 Authentication and environment variables\nPut your API key in GEMINI_API_KEY\nREST requests use x-goog-api-key: $GEMINI_API_KEY\n3.2 Two file input modes: Inline vs Files API\n\nInline (embedded bytes/Base64)\n\nPros: shorter call chain, good for small files.\nKey constraint: total request size (text prompt + system instructions + embedded bytes) typically has a ~20MB ceiling.\n\nFiles API (upload then reference)\n\nPros: good for large files, reusing the same file, or multi-turn conversations.\nTypical flow:\nfiles.upload(...) (SDK) or POST /upload/v1beta/files (REST resumable)\nUse file_data / file_uri in generateContent\n\nEngineering suggestion: implement ensure_file_uri() so that when a file exceeds a threshold (for example 10-15MB warning) or is reused, you automatically route through the Files API.\n\n3.3 Unified handling of binary media outputs\nImages: usually returned as inline_data (Base64) in response parts; in the SDK use part.as_image() or decode Base64 and save as PNG/JPG.\nSpeech (TTS): usually returns PCM bytes (Base64); save as .pcm or wrap into .wav (commonly 24kHz, 16-bit, mono).\nVideo (Veo): long-running async task; poll the operation; download the file (or use the returned URI).\n4. Model selection matrix (choose by scenario)\n\nImportant: model names, versions, limits, and quotas can change over time. Verify against official docs before use. 
Last updated: 2026-01-22.\n\n4.1 Image generation (Nano Banana)\ngemini-2.5-flash-image: optimized for speed/throughput; good for frequent, low-latency generation/editing.\ngemini-3-pro-image-preview: stronger instruction following and high-fidelity text rendering; better for professional assets and complex edits.\n4.2 General image/video/audio understanding\nDocs use gemini-3-flash-preview for image, video, and audio understanding (choose stronger models as needed for quality/cost).\n4.3 Video generation (Veo)\nExample model: veo-3.1-generate-preview (generates 8-second video and can natively generate audio).\n4.4 Speech generation (TTS)\nExample model: gemini-2.5-flash-preview-tts (native TTS, currently in preview).\n5. Image generation (Nano Banana)\n5.1 Text-to-Image\n\nSDK (Node.js) minimal template\n\nimport { GoogleGenAI } from \"@google/genai\";\nimport * as fs from \"node:fs\";\n\nconst ai = new GoogleGenAI({ apiKey: process.env.GEMINI_API_KEY });\n\nconst response = await ai.models.generateContent({\n  model: \"gemini-2.5-flash-image\",\n  contents:\n    \"Create a picture of a nano banana dish in a fancy restaurant with a Gemini theme\",\n});\n\nconst parts = response.candidates?.[0]?.content?.parts ?? 
[];\nfor (const part of parts) {\n  if (part.text) console.log(part.text);\n  if (part.inlineData?.data) {\n    fs.writeFileSync(\"out.png\", Buffer.from(part.inlineData.data, \"base64\"));\n  }\n}\n\n\nREST (with imageConfig) minimal template\n\ncurl -s -X POST   \"https://generativelanguage.googleapis.com/v1beta/models/gemini-2.5-flash-image:generateContent\"   -H \"x-goog-api-key: $GEMINI_API_KEY\"   -H \"Content-Type: application/json\"   -d '{\n    \"contents\":[{\"parts\":[{\"text\":\"Create a picture of a nano banana dish in a fancy restaurant with a Gemini theme\"}]}],\n    \"generationConfig\": {\"imageConfig\": {\"aspectRatio\":\"16:9\"}}\n  }'\n\n\nREST image parsing (Base64 decode)\n\ncurl -s -X POST \"https://generativelanguage.googleapis.com/v1beta/models/gemini-2.5-flash-image:generateContent\" \\\n  -H \"x-goog-api-key: $GEMINI_API_KEY\" \\\n  -H \"Content-Type: application/json\" \\\n  -d '{\"contents\":[{\"parts\":[{\"text\":\"A minimal studio product shot of a nano banana\"}]}]}' \\\n  | jq -r '.candidates[0].content.parts[] | select(.inline_data) | .inline_data.data' \\\n  | base64 --decode > out.png\n\n# macOS can use: base64 -D > out.png\n\n5.2 Text-and-Image-to-Image\n\nUse case: given an image, add/remove/modify elements, change style, color grading, etc.\n\nSDK (Node.js) minimal template\n\nimport { GoogleGenAI } from \"@google/genai\";\nimport * as fs from \"node:fs\";\n\nconst ai = new GoogleGenAI({ apiKey: process.env.GEMINI_API_KEY });\n\nconst prompt =\n  \"Add a nano banana on the table, keep lighting consistent, cinematic tone.\";\nconst imageBase64 = fs.readFileSync(\"input.png\").toString(\"base64\");\n\nconst response = await ai.models.generateContent({\n  model: \"gemini-2.5-flash-image\",\n  contents: [\n    { text: prompt },\n    { inlineData: { mimeType: \"image/png\", data: imageBase64 } },\n  ],\n});\n\nconst parts = response.candidates?.[0]?.content?.parts ?? 
[];\nfor (const part of parts) {\n  if (part.inlineData?.data) {\n    fs.writeFileSync(\"edited.png\", Buffer.from(part.inlineData.data, \"base64\"));\n  }\n}\n\n5.3 Multi-turn image iteration (Multi-turn editing)\n\nBest practice: use chat for continuous iteration (for example: generate first, then \"only edit a specific region/element\", then \"make variants in the same style\").\nTo output mixed \"text + image\" results, set response_modalities to [\"TEXT\", \"IMAGE\"].\n\n5.4 ImageConfig\n\nYou can set in generationConfig.imageConfig or the SDK config:\n\naspectRatio: e.g. 16:9, 1:1.\nimageSize: e.g. 2K, 4K (higher resolution is usually slower/more expensive and model support can vary).\n6. Image understanding (Image Understanding)\n6.1 Two ways to provide input images\nInline image data: suitable for small files (total request size < 20MB).\nFiles API upload: better for large files or reuse across multiple requests.\n6.2 Inline images (Node.js) minimal template\nimport { GoogleGenAI } from \"@google/genai\";\nimport * as fs from \"node:fs\";\n\nconst ai = new GoogleGenAI({ apiKey: process.env.GEMINI_API_KEY });\n\nconst imageBase64 = fs.readFileSync(\"image.jpg\").toString(\"base64\");\n\nconst response = await ai.models.generateContent({\n  model: \"gemini-3-flash-preview\",\n  contents: [\n    { inlineData: { mimeType: \"image/jpeg\", data: imageBase64 } },\n    { text: \"Caption this image, and list any visible brands.\" },\n  ],\n});\n\nconsole.log(response.text);\n\n6.3 Upload and reference with Files API (Node.js) minimal template\nimport { GoogleGenAI, createPartFromUri, createUserContent } from \"@google/genai\";\n\nconst ai = new GoogleGenAI({ apiKey: process.env.GEMINI_API_KEY });\nconst uploaded = await ai.files.upload({ file: \"image.jpg\" });\n\nconst response = await ai.models.generateContent({\n  model: \"gemini-3-flash-preview\",\n  contents: createUserContent([\n    createPartFromUri(uploaded.uri, uploaded.mimeType),\n    \"Caption this 
image.\",\n  ]),\n});\n\nconsole.log(response.text);\n\n6.4 Multi-image prompts\n\nAppend multiple images as multiple Part entries in the same contents; you can mix uploaded references and inline bytes.\n\n7. Video generation (Veo 3.1)\n7.1 Core features (must know)\nGenerates 8-second high-fidelity video, optionally 720p / 1080p / 4k, and supports native audio generation (dialogue, ambience, SFX).\nSupports:\nAspect ratio (16:9 / 9:16)\nVideo extension (extend a generated video; typically limited to 720p)\nFirst/last frame control (frame-specific)\nUp to 3 reference images (image-based direction)\n7.2 SDK (Node.js) minimal template: async polling + download\nimport { GoogleGenAI } from \"@google/genai\";\n\nconst ai = new GoogleGenAI({ apiKey: process.env.GEMINI_API_KEY });\n\nconst prompt =\n  \"A cinematic shot of a cat astronaut walking on the moon. Include subtle wind ambience.\";\nlet operation = await ai.models.generateVideos({\n  model: \"veo-3.1-generate-preview\",\n  prompt,\n  config: { resolution: \"1080p\" },\n});\n\nwhile (!operation.done) {\n  await new Promise((resolve) => setTimeout(resolve, 10_000));\n  operation = await ai.operations.getVideosOperation({ operation });\n}\n\nconst video = operation.response?.generatedVideos?.[0]?.video;\nif (!video) throw new Error(\"No video returned\");\nawait ai.files.download({ file: video, downloadPath: \"out.mp4\" });\n\n7.3 REST minimal template: predictLongRunning + poll + download\n\nKey point: Veo REST uses :predictLongRunning to return an operation name, then poll GET /v1beta/{operation_name}; once done, download from the video URI in the response.\n\n7.4 Common controls (recommend a unified wrapper)\naspectRatio: \"16:9\" or \"9:16\"\nresolution: \"720p\" | \"1080p\" | \"4k\" (higher resolutions are usually slower/more expensive)\nWhen writing prompts: put dialogue in quotes; explicitly call out SFX and ambience; use cinematography language (camera position, movement, composition, lens effects, 
mood).\nNegative constraints: if the API supports a negative prompt field, use it; otherwise list elements you do not want to see.\n7.5 Important limits (engineering fallback needed)\nLatency can vary from seconds to minutes; implement timeouts and retries.\nGenerated videos are only retained on the server for a limited time (download promptly).\nOutputs include a SynthID watermark.\n\nPolling fallback (with timeout/backoff) pseudocode\n\nconst deadline = Date.now() + 300_000; // 5 min\nlet sleepMs = 2000;\nwhile (!operation.done && Date.now() < deadline) {\n  await new Promise((resolve) => setTimeout(resolve, sleepMs));\n  sleepMs = Math.min(Math.floor(sleepMs * 1.5), 15_000);\n  operation = await ai.operations.getVideosOperation({ operation });\n}\nif (!operation.done) throw new Error(\"video generation timed out\");\n\n8. Video understanding (Video Understanding)\n8.1 Video input options\nFiles API upload: recommended when file > 100MB, video length > ~1 minute, or you need reuse.\nInline video data: for smaller files.\nDirect YouTube URL: can analyze public videos.\n8.2 Files API (Node.js) minimal template\nimport { GoogleGenAI, createPartFromUri, createUserContent } from \"@google/genai\";\n\nconst ai = new GoogleGenAI({ apiKey: process.env.GEMINI_API_KEY });\nconst uploaded = await ai.files.upload({ file: \"sample.mp4\" });\n\nconst response = await ai.models.generateContent({\n  model: \"gemini-3-flash-preview\",\n  contents: createUserContent([\n    createPartFromUri(uploaded.uri, uploaded.mimeType),\n    \"Summarize this video. Provide timestamps for key events.\",\n  ]),\n});\n\nconsole.log(response.text);\n\n8.3 Timestamp prompting strategy\nAsk for segmented bullets with \"(mm:ss)\" timestamps.\nRequire \"evidence with specific time ranges\" and include downstream structured extraction (JSON) in the same prompt if needed.\n9. 
Speech generation (Text-to-Speech, TTS)\n9.1 Positioning\nNative TTS: for \"precise reading + controllable style\" (podcasts, audiobooks, ad voiceover, etc.).\nDistinguish from the Live API: Live API is more interactive and non-structured audio/multimodal conversation; TTS is focused on controlled narration.\n9.2 Single-speaker TTS (Node.js) minimal template\nimport { GoogleGenAI } from \"@google/genai\";\nimport * as fs from \"node:fs\";\n\nconst ai = new GoogleGenAI({ apiKey: process.env.GEMINI_API_KEY });\n\nconst response = await ai.models.generateContent({\n  model: \"gemini-2.5-flash-preview-tts\",\n  contents: [{ parts: [{ text: \"Say cheerfully: Have a wonderful day!\" }] }],\n  config: {\n    responseModalities: [\"AUDIO\"],\n    speechConfig: {\n      voiceConfig: {\n        prebuiltVoiceConfig: { voiceName: \"Kore\" },\n      },\n    },\n  },\n});\n\nconst data =\n  response.candidates?.[0]?.content?.parts?.[0]?.inlineData?.data ?? \"\";\nif (!data) throw new Error(\"No audio returned\");\nfs.writeFileSync(\"out.pcm\", Buffer.from(data, \"base64\"));\n\n9.3 Multi-speaker TTS (max 2 speakers)\n\nRequirements:\n\nUse multiSpeakerVoiceConfig\nEach speaker name must match the dialogue labels in the prompt (e.g., Joe/Jane).\n9.4 Voice options and language\nvoice_name supports 30 prebuilt voices (for example Zephyr, Puck, Charon, Kore, etc.).\nThe model can auto-detect input language and supports 24 languages (see docs for the list).\n9.5 \"Director notes\" (strongly recommended for high-quality voice)\n\nProvide controllable directions for style, pace, accent, etc., but avoid over-constraining.\n\n10. 
Audio understanding (Audio Understanding)\n10.1 Typical tasks\nDescribe audio content (including non-speech like birds, alarms, etc.)\nGenerate transcripts\nTranscribe specific time ranges\nCount tokens (for cost estimates/segmentation)\n10.2 Files API (Node.js) minimal template\nimport { GoogleGenAI, createPartFromUri, createUserContent } from \"@google/genai\";\n\nconst ai = new GoogleGenAI({ apiKey: process.env.GEMINI_API_KEY });\nconst uploaded = await ai.files.upload({ file: \"sample.mp3\" });\n\nconst response = await ai.models.generateContent({\n  model: \"gemini-3-flash-preview\",\n  contents: createUserContent([\n    \"Describe this audio clip.\",\n    createPartFromUri(uploaded.uri, uploaded.mimeType),\n  ]),\n});\n\nconsole.log(response.text);\n\n10.3 Key limits and engineering tips\nSupports common formats: WAV/MP3/AIFF/AAC/OGG/FLAC.\nAudio tokenization: about 32 tokens/second (about 1920 tokens per minute; values may change).\nTotal audio length per prompt is capped at 9.5 hours; multi-channel audio is downmixed; audio is resampled (see docs for exact parameters).\nIf total request size exceeds 20MB, you must use the Files API.\n11. 
End-to-end examples (composition)\nExample A: Image generation -> validation via understanding\nGenerate product images with Nano Banana (require negative space, consistent lighting).\nUse image understanding for self-check: verify text clarity, brand spelling, and unsafe elements.\nIf not satisfied, feed the generated image into text+image editing and iterate.\nExample B: Video generation -> video understanding -> narration script\nGenerate an 8-second shot with Veo (include dialogue or SFX).\nDownload and save (respect retention window).\nUpload video to video understanding to produce a storyboard + timestamps + narration copy (then feed to TTS).\nExample C: Audio understanding -> time-range transcription -> TTS redub\nUpload meeting audio and transcribe full content.\nTranscribe or summarize specific time ranges.\nUse TTS to generate a \"broadcast\" version of the summary.\n12. Compliance and risk (must follow)\nEnsure you have the necessary rights to upload images/video/audio; do not generate infringing, deceptive, harassing, or harmful content.\nGenerated images and videos include SynthID watermarking; videos may also have regional/person-based generation constraints.\nProduction systems must implement timeouts, retries, failure fallbacks, and human review/post-processing for generated content.\n13. Quick reference (Checklist)\n Pick the right model: image generation (Flash Image / Pro Image Preview), video generation (Veo 3.1), TTS (Gemini 2.5 TTS), understanding (Gemini Flash/Pro).\n Pick the right input mode: inline for small files; Files API for large/reuse.\n Parse binary outputs correctly: image/audio via inline_data decode; video via operation polling + download.\n For video generation: set aspectRatio / resolution, and download promptly (avoid expiration).\n For TTS: set response_modalities=[\"AUDIO\"]; max 2 speakers; speaker names must match prompt.\n For audio understanding: countTokens when needed; segment long audio or use Files API."
  },
  "trust": {
    "sourceLabel": "tencent",
    "provenanceUrl": "https://clawhub.ai/Xsir0/google-gemini-media",
    "publisherUrl": "https://clawhub.ai/Xsir0/google-gemini-media",
    "owner": "Xsir0",
    "version": "1.0.1",
    "license": null,
    "verificationStatus": "Indexed source record"
  },
  "links": {
    "detailUrl": "https://openagent3.xyz/skills/google-gemini-media",
    "downloadUrl": "https://openagent3.xyz/downloads/google-gemini-media",
    "agentUrl": "https://openagent3.xyz/skills/google-gemini-media/agent",
    "manifestUrl": "https://openagent3.xyz/skills/google-gemini-media/agent.json",
    "briefUrl": "https://openagent3.xyz/skills/google-gemini-media/agent.md"
  }
}