{
  "schemaVersion": "1.0",
  "item": {
    "slug": "extract-pdf-text",
    "name": "Extract PDF Text",
    "source": "tencent",
    "type": "skill",
    "category": "开发工具",
    "sourceUrl": "https://clawhub.ai/ivangdavila/extract-pdf-text",
    "canonicalUrl": "https://clawhub.ai/ivangdavila/extract-pdf-text",
    "targetPlatform": "OpenClaw"
  },
  "install": {
    "downloadMode": "redirect",
    "downloadUrl": "/downloads/extract-pdf-text",
    "sourceDownloadUrl": "https://wry-manatee-359.convex.site/api/v1/download?slug=extract-pdf-text",
    "sourcePlatform": "tencent",
    "targetPlatform": "OpenClaw",
    "installMethod": "Manual import",
    "extraction": "Extract archive",
    "prerequisites": [
      "OpenClaw"
    ],
    "packageFormat": "ZIP package",
    "includedAssets": [
      "SKILL.md",
      "examples.md",
      "ocr.md",
      "troubleshooting.md"
    ],
    "primaryDoc": "SKILL.md",
    "quickSetup": [
      "Download the package from Yavira.",
      "Extract the archive and review SKILL.md first.",
      "Import or place the package into your OpenClaw setup."
    ],
    "agentAssist": {
      "summary": "Hand the extracted package to your coding agent with a concrete install brief instead of figuring it out manually.",
      "steps": [
        "Download the package from Yavira.",
        "Extract it into a folder your agent can access.",
        "Paste one of the prompts below and point your agent at the extracted folder."
      ],
      "prompts": [
        {
          "label": "New install",
          "body": "I downloaded a skill package from Yavira. Read SKILL.md from the extracted folder and install it by following the included instructions. Tell me what you changed and call out any manual steps you could not complete."
        },
        {
          "label": "Upgrade existing",
          "body": "I downloaded an updated skill package from Yavira. Read SKILL.md from the extracted folder, compare it with my current installation, and upgrade it while preserving any custom configuration unless the package docs explicitly say otherwise. Summarize what changed and any follow-up checks I should run."
        }
      ]
    },
    "sourceHealth": {
      "source": "tencent",
      "status": "healthy",
      "reason": "direct_download_ok",
      "recommendedAction": "download",
      "checkedAt": "2026-04-30T16:55:25.780Z",
      "expiresAt": "2026-05-07T16:55:25.780Z",
      "httpStatus": 200,
      "finalUrl": "https://wry-manatee-359.convex.site/api/v1/download?slug=network",
      "contentType": "application/zip",
      "probeMethod": "head",
      "details": {
        "probeUrl": "https://wry-manatee-359.convex.site/api/v1/download?slug=network",
        "contentDisposition": "attachment; filename=\"network-1.0.0.zip\"",
        "redirectLocation": null,
        "bodySnippet": null
      },
      "scope": "source",
      "summary": "Source download looks usable.",
      "detail": "Yavira can redirect you to the upstream package for this source.",
      "primaryActionLabel": "Download for OpenClaw",
      "primaryActionHref": "/downloads/extract-pdf-text"
    },
    "validation": {
      "installChecklist": [
        "Use the Yavira download entry.",
        "Review SKILL.md after the package is downloaded.",
        "Confirm the extracted package contains the expected setup assets."
      ],
      "postInstallChecks": [
        "Confirm the extracted package includes the expected docs or setup files.",
        "Validate the skill or prompts are available in your target agent workspace.",
        "Capture any manual follow-up steps the agent could not complete."
      ]
    },
    "downloadPageUrl": "https://openagent3.xyz/downloads/extract-pdf-text",
    "agentPageUrl": "https://openagent3.xyz/skills/extract-pdf-text/agent",
    "manifestUrl": "https://openagent3.xyz/skills/extract-pdf-text/agent.json",
    "briefUrl": "https://openagent3.xyz/skills/extract-pdf-text/agent.md"
  },
  "agentAssist": {
    "summary": "Hand the extracted package to your coding agent with a concrete install brief instead of figuring it out manually.",
    "steps": [
      "Download the package from Yavira.",
      "Extract it into a folder your agent can access.",
      "Paste one of the prompts below and point your agent at the extracted folder."
    ],
    "prompts": [
      {
        "label": "New install",
        "body": "I downloaded a skill package from Yavira. Read SKILL.md from the extracted folder and install it by following the included instructions. Tell me what you changed and call out any manual steps you could not complete."
      },
      {
        "label": "Upgrade existing",
        "body": "I downloaded an updated skill package from Yavira. Read SKILL.md from the extracted folder, compare it with my current installation, and upgrade it while preserving any custom configuration unless the package docs explicitly say otherwise. Summarize what changed and any follow-up checks I should run."
      }
    ]
  },
  "documentation": {
    "source": "clawhub",
    "primaryDoc": "SKILL.md",
    "sections": [
      {
        "title": "When to Use",
        "body": "Agent needs to extract text from PDFs. Use PyMuPDF (fitz) for fast local extraction. Works with text-based documents, scanned pages with OCR, forms, and complex layouts."
      },
      {
        "title": "Quick Reference",
        "body": "TopicFileCode examplesexamples.mdOCR setupocr.mdTroubleshootingtroubleshooting.md"
      },
      {
        "title": "1. Install PyMuPDF First",
        "body": "pip install PyMuPDF\n\nImport as fitz (historical name):\n\nimport fitz  # PyMuPDF"
      },
      {
        "title": "2. Basic Text Extraction",
        "body": "import fitz\n\ndoc = fitz.open(\"document.pdf\")\ntext = \"\"\nfor page in doc:\n    text += page.get_text()\ndoc.close()"
      },
      {
        "title": "3. Pick the Right Method",
        "body": "PDF TypeMethodText-basedpage.get_text() — fast, accurateScannedOCR with pytesseract — slowerMixedCheck each page, use OCR when needed"
      },
      {
        "title": "4. Check for Text Before OCR",
        "body": "def needs_ocr(page):\n    text = page.get_text().strip()\n    return len(text) < 50  # Likely scanned if very little text"
      },
      {
        "title": "5. Handle Errors Gracefully",
        "body": "try:\n    doc = fitz.open(path)\nexcept fitz.FileDataError:\n    print(\"Invalid or corrupted PDF\")\nexcept fitz.PasswordError:\n    doc = fitz.open(path, password=\"secret\")"
      },
      {
        "title": "Extraction Traps",
        "body": "TrapWhat HappensFixOCR on text PDFSlow + worse accuracyCheck get_text() firstForget to close docMemory leakUse with or doc.close()Assume page orderWrong reading flowUse sort=True in get_text()Ignore encodingGarbled charactersPyMuPDF handles UTF-8"
      },
      {
        "title": "Scope",
        "body": "This skill provides instructions for using PyMuPDF to extract PDF text.\n\nThis skill ONLY:\n\nGives code examples for PyMuPDF\nExplains OCR setup when needed\nTroubleshoots common issues\n\nThis skill NEVER:\n\nAccesses files without user request\nSends data externally\nModifies original PDFs"
      },
      {
        "title": "Security & Privacy",
        "body": "All processing is local:\n\nPyMuPDF runs entirely on your machine\nNo external API calls\nNo data leaves your system"
      },
      {
        "title": "Plain Text",
        "body": "text = page.get_text()"
      },
      {
        "title": "Structured (dict)",
        "body": "blocks = page.get_text(\"dict\")[\"blocks\"]\nfor b in blocks:\n    if b[\"type\"] == 0:  # text block\n        for line in b[\"lines\"]:\n            for span in line[\"spans\"]:\n                print(span[\"text\"], span[\"size\"])"
      },
      {
        "title": "JSON",
        "body": "import json\ndata = page.get_text(\"json\")\nparsed = json.loads(data)"
      },
      {
        "title": "Full Example",
        "body": "import fitz\n\ndef extract_pdf(path):\n    \"\"\"Extract text from PDF, with OCR fallback for scanned pages.\"\"\"\n    doc = fitz.open(path)\n    results = []\n    \n    for i, page in enumerate(doc):\n        text = page.get_text()\n        method = \"text\"\n        \n        # If very little text, might be scanned\n        if len(text.strip()) < 50:\n            # OCR would go here (see ocr.md)\n            method = \"needs_ocr\"\n        \n        results.append({\n            \"page\": i + 1,\n            \"text\": text,\n            \"method\": method\n        })\n    \n    doc.close()\n    return {\n        \"pages\": len(results),\n        \"content\": results,\n        \"word_count\": sum(len(r[\"text\"].split()) for r in results)\n    }\n\n# Usage\nresult = extract_pdf(\"document.pdf\")\nprint(f\"Extracted {result['word_count']} words from {result['pages']} pages\")"
      },
      {
        "title": "Feedback",
        "body": "Useful? clawhub star extract-pdf-text\nStay updated: clawhub sync"
      }
    ],
    "body": "When to Use\n\nAgent needs to extract text from PDFs. Use PyMuPDF (fitz) for fast local extraction. Works with text-based documents, scanned pages with OCR, forms, and complex layouts.\n\nQuick Reference\nTopic\tFile\nCode examples\texamples.md\nOCR setup\tocr.md\nTroubleshooting\ttroubleshooting.md\nCore Rules\n1. Install PyMuPDF First\npip install PyMuPDF\n\n\nImport as fitz (historical name):\n\nimport fitz  # PyMuPDF\n\n2. Basic Text Extraction\nimport fitz\n\ndoc = fitz.open(\"document.pdf\")\ntext = \"\"\nfor page in doc:\n    text += page.get_text()\ndoc.close()\n\n3. Pick the Right Method\nPDF Type\tMethod\nText-based\tpage.get_text() — fast, accurate\nScanned\tOCR with pytesseract — slower\nMixed\tCheck each page, use OCR when needed\n4. Check for Text Before OCR\ndef needs_ocr(page):\n    text = page.get_text().strip()\n    return len(text) < 50  # Likely scanned if very little text\n\n5. Handle Errors Gracefully\ntry:\n    doc = fitz.open(path)\nexcept fitz.FileDataError:\n    print(\"Invalid or corrupted PDF\")\nexcept fitz.PasswordError:\n    doc = fitz.open(path, password=\"secret\")\n\nExtraction Traps\nTrap\tWhat Happens\tFix\nOCR on text PDF\tSlow + worse accuracy\tCheck get_text() first\nForget to close doc\tMemory leak\tUse with or doc.close()\nAssume page order\tWrong reading flow\tUse sort=True in get_text()\nIgnore encoding\tGarbled characters\tPyMuPDF handles UTF-8\nScope\n\nThis skill provides instructions for using PyMuPDF to extract PDF text.\n\nThis skill ONLY:\n\nGives code examples for PyMuPDF\nExplains OCR setup when needed\nTroubleshoots common issues\n\nThis skill NEVER:\n\nAccesses files without user request\nSends data externally\nModifies original PDFs\nSecurity & Privacy\n\nAll processing is local:\n\nPyMuPDF runs entirely on your machine\nNo external API calls\nNo data leaves your system\nOutput Formats\nPlain Text\ntext = page.get_text()\n\nStructured (dict)\nblocks = page.get_text(\"dict\")[\"blocks\"]\nfor b in blocks:\n    if b[\"type\"] == 0:  # text block\n        for line in b[\"lines\"]:\n            for span in line[\"spans\"]:\n                print(span[\"text\"], span[\"size\"])\n\nJSON\nimport json\ndata = page.get_text(\"json\")\nparsed = json.loads(data)\n\nFull Example\nimport fitz\n\ndef extract_pdf(path):\n    \"\"\"Extract text from PDF, with OCR fallback for scanned pages.\"\"\"\n    doc = fitz.open(path)\n    results = []\n    \n    for i, page in enumerate(doc):\n        text = page.get_text()\n        method = \"text\"\n        \n        # If very little text, might be scanned\n        if len(text.strip()) < 50:\n            # OCR would go here (see ocr.md)\n            method = \"needs_ocr\"\n        \n        results.append({\n            \"page\": i + 1,\n            \"text\": text,\n            \"method\": method\n        })\n    \n    doc.close()\n    return {\n        \"pages\": len(results),\n        \"content\": results,\n        \"word_count\": sum(len(r[\"text\"].split()) for r in results)\n    }\n\n# Usage\nresult = extract_pdf(\"document.pdf\")\nprint(f\"Extracted {result['word_count']} words from {result['pages']} pages\")\n\nFeedback\nUseful? clawhub star extract-pdf-text\nStay updated: clawhub sync"
  },
  "trust": {
    "sourceLabel": "tencent",
    "provenanceUrl": "https://clawhub.ai/ivangdavila/extract-pdf-text",
    "publisherUrl": "https://clawhub.ai/ivangdavila/extract-pdf-text",
    "owner": "ivangdavila",
    "version": "1.0.2",
    "license": null,
    "verificationStatus": "Indexed source record"
  },
  "links": {
    "detailUrl": "https://openagent3.xyz/skills/extract-pdf-text",
    "downloadUrl": "https://openagent3.xyz/downloads/extract-pdf-text",
    "agentUrl": "https://openagent3.xyz/skills/extract-pdf-text/agent",
    "manifestUrl": "https://openagent3.xyz/skills/extract-pdf-text/agent.json",
    "briefUrl": "https://openagent3.xyz/skills/extract-pdf-text/agent.md"
  }
}