{
  "schemaVersion": "1.0",
  "item": {
    "slug": "pdf-text-extractor",
    "name": "PDF Text Extractor",
    "source": "tencent",
    "type": "skill",
    "category": "开发工具",
    "sourceUrl": "https://clawhub.ai/Michael-laffin/pdf-text-extractor",
    "canonicalUrl": "https://clawhub.ai/Michael-laffin/pdf-text-extractor",
    "targetPlatform": "OpenClaw"
  },
  "install": {
    "downloadMode": "redirect",
    "downloadUrl": "/downloads/pdf-text-extractor",
    "sourceDownloadUrl": "https://wry-manatee-359.convex.site/api/v1/download?slug=pdf-text-extractor",
    "sourcePlatform": "tencent",
    "targetPlatform": "OpenClaw",
    "installMethod": "Manual import",
    "extraction": "Extract archive",
    "prerequisites": [
      "OpenClaw"
    ],
    "packageFormat": "ZIP package",
    "includedAssets": [
      "README.md",
      "SKILL.md",
      "config.json",
      "index.js",
      "package-lock.json",
      "package.json"
    ],
    "primaryDoc": "SKILL.md",
    "quickSetup": [
      "Download the package from Yavira.",
      "Extract the archive and review SKILL.md first.",
      "Import or place the package into your OpenClaw setup."
    ],
    "agentAssist": {
      "summary": "Hand the extracted package to your coding agent with a concrete install brief instead of figuring it out manually.",
      "steps": [
        "Download the package from Yavira.",
        "Extract it into a folder your agent can access.",
        "Paste one of the prompts below and point your agent at the extracted folder."
      ],
      "prompts": [
        {
          "label": "New install",
          "body": "I downloaded a skill package from Yavira. Read SKILL.md from the extracted folder and install it by following the included instructions. Then review README.md for any prerequisites, environment setup, or post-install checks. Tell me what you changed and call out any manual steps you could not complete."
        },
        {
          "label": "Upgrade existing",
          "body": "I downloaded an updated skill package from Yavira. Read SKILL.md from the extracted folder, compare it with my current installation, and upgrade it while preserving any custom configuration unless the package docs explicitly say otherwise. Then review README.md for any prerequisites, environment setup, or post-install checks. Summarize what changed and any follow-up checks I should run."
        }
      ]
    },
    "sourceHealth": {
      "source": "tencent",
      "status": "healthy",
      "reason": "direct_download_ok",
      "recommendedAction": "download",
      "checkedAt": "2026-04-30T16:55:25.780Z",
      "expiresAt": "2026-05-07T16:55:25.780Z",
      "httpStatus": 200,
      "finalUrl": "https://wry-manatee-359.convex.site/api/v1/download?slug=network",
      "contentType": "application/zip",
      "probeMethod": "head",
      "details": {
        "probeUrl": "https://wry-manatee-359.convex.site/api/v1/download?slug=network",
        "contentDisposition": "attachment; filename=\"network-1.0.0.zip\"",
        "redirectLocation": null,
        "bodySnippet": null
      },
      "scope": "source",
      "summary": "Source download looks usable.",
      "detail": "Yavira can redirect you to the upstream package for this source.",
      "primaryActionLabel": "Download for OpenClaw",
      "primaryActionHref": "/downloads/pdf-text-extractor"
    },
    "validation": {
      "installChecklist": [
        "Use the Yavira download entry.",
        "Review SKILL.md after the package is downloaded.",
        "Confirm the extracted package contains the expected setup assets."
      ],
      "postInstallChecks": [
        "Confirm the extracted package includes the expected docs or setup files.",
        "Validate the skill or prompts are available in your target agent workspace.",
        "Capture any manual follow-up steps the agent could not complete."
      ]
    },
    "downloadPageUrl": "https://openagent3.xyz/downloads/pdf-text-extractor",
    "agentPageUrl": "https://openagent3.xyz/skills/pdf-text-extractor/agent",
    "manifestUrl": "https://openagent3.xyz/skills/pdf-text-extractor/agent.json",
    "briefUrl": "https://openagent3.xyz/skills/pdf-text-extractor/agent.md"
  },
  "agentAssist": {
    "summary": "Hand the extracted package to your coding agent with a concrete install brief instead of figuring it out manually.",
    "steps": [
      "Download the package from Yavira.",
      "Extract it into a folder your agent can access.",
      "Paste one of the prompts below and point your agent at the extracted folder."
    ],
    "prompts": [
      {
        "label": "New install",
        "body": "I downloaded a skill package from Yavira. Read SKILL.md from the extracted folder and install it by following the included instructions. Then review README.md for any prerequisites, environment setup, or post-install checks. Tell me what you changed and call out any manual steps you could not complete."
      },
      {
        "label": "Upgrade existing",
        "body": "I downloaded an updated skill package from Yavira. Read SKILL.md from the extracted folder, compare it with my current installation, and upgrade it while preserving any custom configuration unless the package docs explicitly say otherwise. Then review README.md for any prerequisites, environment setup, or post-install checks. Summarize what changed and any follow-up checks I should run."
      }
    ]
  },
  "documentation": {
    "source": "clawhub",
    "primaryDoc": "SKILL.md",
    "sections": [
      {
        "title": "PDF-Text-Extractor - Extract Text from PDFs",
        "body": "Vernox Utility Skill - Perfect for document digitization."
      },
      {
        "title": "Overview",
        "body": "PDF-Text-Extractor is a zero-dependency tool for extracting text content from PDF files. Supports both embedded text extraction (for text-based PDFs) and OCR (for scanned documents)."
      },
      {
        "title": "✅ Text Extraction",
        "body": "Extract text from PDFs without external tools\nSupport for both text-based and scanned PDFs\nPreserve document structure and formatting\nFast extraction (milliseconds for text-based)"
      },
      {
        "title": "✅ OCR Support",
        "body": "Use Tesseract.js for scanned documents\nSupport multiple languages (English, Spanish, French, German)\nConfigurable OCR quality/speed\nFallback to text extraction when possible"
      },
      {
        "title": "✅ Batch Processing",
        "body": "Process multiple PDFs at once\nBatch extraction for document workflows\nProgress tracking for large files\nError handling and retry logic"
      },
      {
        "title": "✅ Output Options",
        "body": "Plain text output\nJSON output with metadata\nMarkdown conversion\nHTML output (preserving links)"
      },
      {
        "title": "✅ Utility Features",
        "body": "Page-by-page extraction\nCharacter/word counting\nLanguage detection\nMetadata extraction (author, title, creation date)"
      },
      {
        "title": "Installation",
        "body": "clawhub install pdf-text-extractor"
      },
      {
        "title": "Extract Text from PDF",
        "body": "const result = await extractText({\n  pdfPath: './document.pdf',\n  options: {\n    outputFormat: 'text',\n    ocr: true,\n    language: 'eng'\n  }\n});\n\nconsole.log(result.text);\nconsole.log(`Pages: ${result.pages}`);\nconsole.log(`Words: ${result.wordCount}`);"
      },
      {
        "title": "Batch Extract Multiple PDFs",
        "body": "const results = await extractBatch({\n  pdfFiles: [\n    './document1.pdf',\n    './document2.pdf',\n    './document3.pdf'\n  ],\n  options: {\n    outputFormat: 'json',\n    ocr: true\n  }\n});\n\nconsole.log(`Extracted ${results.length} PDFs`);"
      },
      {
        "title": "Extract with OCR",
        "body": "const result = await extractText({\n  pdfPath: './scanned-document.pdf',\n  options: {\n    ocr: true,\n    language: 'eng',\n    ocrQuality: 'high'\n  }\n});\n\n// OCR will be used (scanned document detected)"
      },
      {
        "title": "extractText",
        "body": "Extract text content from a single PDF file.\n\nParameters:\n\npdfPath (string, required): Path to PDF file\noptions (object, optional): Extraction options\n\noutputFormat (string): 'text' | 'json' | 'markdown' | 'html'\nocr (boolean): Enable OCR for scanned docs\nlanguage (string): OCR language code ('eng', 'spa', 'fra', 'deu')\npreserveFormatting (boolean): Keep headings/structure\nminConfidence (number): Minimum OCR confidence score (0-100)\n\nReturns:\n\ntext (string): Extracted text content\npages (number): Number of pages processed\nwordCount (number): Total word count\ncharCount (number): Total character count\nlanguage (string): Detected language\nmetadata (object): PDF metadata (title, author, creation date)\nmethod (string): 'text' or 'ocr' (extraction method)"
      },
      {
        "title": "extractBatch",
        "body": "Extract text from multiple PDF files at once.\n\nParameters:\n\npdfFiles (array, required): Array of PDF file paths\noptions (object, optional): Same as extractText\n\nReturns:\n\nresults (array): Array of extraction results\ntotalPages (number): Total pages across all PDFs\nsuccessCount (number): Successfully extracted\nfailureCount (number): Failed extractions\nerrors (array): Error details for failures"
      },
      {
        "title": "countWords",
        "body": "Count words in extracted text.\n\nParameters:\n\ntext (string, required): Text to count\noptions (object, optional):\n\nminWordLength (number): Minimum characters per word (default: 3)\nexcludeNumbers (boolean): Don't count numbers as words\ncountByPage (boolean): Return word count per page\n\nReturns:\n\nwordCount (number): Total word count\ncharCount (number): Total character count\npageCounts (array): Word count per page\naverageWordsPerPage (number): Average words per page"
      },
      {
        "title": "detectLanguage",
        "body": "Detect the language of extracted text.\n\nParameters:\n\ntext (string, required): Text to analyze\nminConfidence (number): Minimum confidence for detection\n\nReturns:\n\nlanguage (string): Detected language code\nlanguageName (string): Full language name\nconfidence (number): Confidence score (0-100)"
      },
      {
        "title": "Document Digitization",
        "body": "Convert paper documents to digital text\nProcess invoices and receipts\nDigitize contracts and agreements\nArchive physical documents"
      },
      {
        "title": "Content Analysis",
        "body": "Extract text for analysis tools\nPrepare content for LLM processing\nClean up scanned documents\nParse PDF-based reports"
      },
      {
        "title": "Data Extraction",
        "body": "Extract data from PDF reports\nParse tables from PDFs\nPull structured data\nAutomate document workflows"
      },
      {
        "title": "Text Processing",
        "body": "Prepare content for translation\nClean up OCR output\nExtract specific sections\nSearch within PDF content"
      },
      {
        "title": "Text-Based PDFs",
        "body": "Speed: ~100ms for 10-page PDF\nAccuracy: 100% (exact text)\nMemory: ~10MB for typical document"
      },
      {
        "title": "OCR Processing",
        "body": "Speed: ~1-3s per page (high quality)\nAccuracy: 85-95% (depends on scan quality)\nMemory: ~50-100MB peak during OCR"
      },
      {
        "title": "PDF Parsing",
        "body": "Uses native PDF.js library\nExtracts text layer directly (no OCR needed)\nPreserves document structure\nHandles password-protected PDFs"
      },
      {
        "title": "OCR Engine",
        "body": "Tesseract.js under the hood\nSupports 100+ languages\nAdjustable quality/speed tradeoff\nConfidence scoring for accuracy"
      },
      {
        "title": "Dependencies",
        "body": "ZERO external dependencies\nUses Node.js built-in modules only\nPDF.js included in skill\nTesseract.js bundled"
      },
      {
        "title": "Invalid PDF",
        "body": "Clear error message\nSuggest fix (check file format)\nSkip to next file in batch"
      },
      {
        "title": "OCR Failure",
        "body": "Report confidence score\nSuggest rescan at higher quality\nFallback to basic extraction"
      },
      {
        "title": "Memory Issues",
        "body": "Stream processing for large files\nProgress reporting\nGraceful degradation"
      },
      {
        "title": "Edit config.json:",
        "body": "{\n  \"ocr\": {\n    \"enabled\": true,\n    \"defaultLanguage\": \"eng\",\n    \"quality\": \"medium\",\n    \"languages\": [\"eng\", \"spa\", \"fra\", \"deu\"]\n  },\n  \"output\": {\n    \"defaultFormat\": \"text\",\n    \"preserveFormatting\": true,\n    \"includeMetadata\": true\n  },\n  \"batch\": {\n    \"maxConcurrent\": 3,\n    \"timeoutSeconds\": 30\n  }\n}"
      },
      {
        "title": "Extract from Invoice",
        "body": "const invoice = await extractText('./invoice.pdf');\nconsole.log(invoice.text);\n// \"INVOICE #12345 Date: 2026-02-04...\""
      },
      {
        "title": "Extract from Scanned Contract",
        "body": "const contract = await extractText('./scanned-contract.pdf', {\n  ocr: true,\n  language: 'eng',\n  ocrQuality: 'high'\n});\nconsole.log(contract.text);\n// \"AGREEMENT This contract between...\""
      },
      {
        "title": "Batch Process Documents",
        "body": "const docs = await extractBatch([\n  './doc1.pdf',\n  './doc2.pdf',\n  './doc3.pdf',\n  './doc4.pdf'\n]);\nconsole.log(`Processed ${docs.successCount}/${docs.results.length} documents`);"
      },
      {
        "title": "OCR Not Working",
        "body": "Check if PDF is truly scanned (not text-based)\nTry different quality settings (low/medium/high)\nEnsure language matches document\nCheck image quality of scan"
      },
      {
        "title": "Extraction Returns Empty",
        "body": "PDF may be image-only\nOCR failed with low confidence\nTry different language setting"
      },
      {
        "title": "Slow Processing",
        "body": "Large PDF takes longer\nReduce quality for speed\nProcess in smaller batches"
      },
      {
        "title": "Best Results",
        "body": "Use text-based PDFs when possible (faster, 100% accurate)\nHigh-quality scans for OCR (300 DPI+)\nClean background before scanning\nUse correct language setting"
      },
      {
        "title": "Performance Optimization",
        "body": "Batch processing for multiple files\nDisable OCR for text-based PDFs\nLower OCR quality for speed when acceptable"
      },
      {
        "title": "Roadmap",
        "body": "PDF/A support\n Advanced OCR pre-processing\n Table extraction from OCR\n Handwriting OCR\n PDF form field extraction\n Batch language detection\n Confidence scoring visualization"
      },
      {
        "title": "License",
        "body": "MIT\n\nExtract text from PDFs. Fast, accurate, zero dependencies. 🔮"
      }
    ],
    "body": "PDF-Text-Extractor - Extract Text from PDFs\n\nVernox Utility Skill - Perfect for document digitization.\n\nOverview\n\nPDF-Text-Extractor is a zero-dependency tool for extracting text content from PDF files. Supports both embedded text extraction (for text-based PDFs) and OCR (for scanned documents).\n\nFeatures\n✅ Text Extraction\nExtract text from PDFs without external tools\nSupport for both text-based and scanned PDFs\nPreserve document structure and formatting\nFast extraction (milliseconds for text-based)\n✅ OCR Support\nUse Tesseract.js for scanned documents\nSupport multiple languages (English, Spanish, French, German)\nConfigurable OCR quality/speed\nFallback to text extraction when possible\n✅ Batch Processing\nProcess multiple PDFs at once\nBatch extraction for document workflows\nProgress tracking for large files\nError handling and retry logic\n✅ Output Options\nPlain text output\nJSON output with metadata\nMarkdown conversion\nHTML output (preserving links)\n✅ Utility Features\nPage-by-page extraction\nCharacter/word counting\nLanguage detection\nMetadata extraction (author, title, creation date)\nInstallation\nclawhub install pdf-text-extractor\n\nQuick Start\nExtract Text from PDF\nconst result = await extractText({\n  pdfPath: './document.pdf',\n  options: {\n    outputFormat: 'text',\n    ocr: true,\n    language: 'eng'\n  }\n});\n\nconsole.log(result.text);\nconsole.log(`Pages: ${result.pages}`);\nconsole.log(`Words: ${result.wordCount}`);\n\nBatch Extract Multiple PDFs\nconst results = await extractBatch({\n  pdfFiles: [\n    './document1.pdf',\n    './document2.pdf',\n    './document3.pdf'\n  ],\n  options: {\n    outputFormat: 'json',\n    ocr: true\n  }\n});\n\nconsole.log(`Extracted ${results.length} PDFs`);\n\nExtract with OCR\nconst result = await extractText({\n  pdfPath: './scanned-document.pdf',\n  options: {\n    ocr: true,\n    language: 'eng',\n    ocrQuality: 'high'\n  }\n});\n\n// OCR will be used (scanned document detected)\n\nTool Functions\nextractText\n\nExtract text content from a single PDF file.\n\nParameters:\n\npdfPath (string, required): Path to PDF file\noptions (object, optional): Extraction options\noutputFormat (string): 'text' | 'json' | 'markdown' | 'html'\nocr (boolean): Enable OCR for scanned docs\nlanguage (string): OCR language code ('eng', 'spa', 'fra', 'deu')\npreserveFormatting (boolean): Keep headings/structure\nminConfidence (number): Minimum OCR confidence score (0-100)\n\nReturns:\n\ntext (string): Extracted text content\npages (number): Number of pages processed\nwordCount (number): Total word count\ncharCount (number): Total character count\nlanguage (string): Detected language\nmetadata (object): PDF metadata (title, author, creation date)\nmethod (string): 'text' or 'ocr' (extraction method)\nextractBatch\n\nExtract text from multiple PDF files at once.\n\nParameters:\n\npdfFiles (array, required): Array of PDF file paths\noptions (object, optional): Same as extractText\n\nReturns:\n\nresults (array): Array of extraction results\ntotalPages (number): Total pages across all PDFs\nsuccessCount (number): Successfully extracted\nfailureCount (number): Failed extractions\nerrors (array): Error details for failures\ncountWords\n\nCount words in extracted text.\n\nParameters:\n\ntext (string, required): Text to count\noptions (object, optional):\nminWordLength (number): Minimum characters per word (default: 3)\nexcludeNumbers (boolean): Don't count numbers as words\ncountByPage (boolean): Return word count per page\n\nReturns:\n\nwordCount (number): Total word count\ncharCount (number): Total character count\npageCounts (array): Word count per page\naverageWordsPerPage (number): Average words per page\ndetectLanguage\n\nDetect the language of extracted text.\n\nParameters:\n\ntext (string, required): Text to analyze\nminConfidence (number): Minimum confidence for detection\n\nReturns:\n\nlanguage (string): Detected language code\nlanguageName (string): Full language name\nconfidence (number): Confidence score (0-100)\nUse Cases\nDocument Digitization\nConvert paper documents to digital text\nProcess invoices and receipts\nDigitize contracts and agreements\nArchive physical documents\nContent Analysis\nExtract text for analysis tools\nPrepare content for LLM processing\nClean up scanned documents\nParse PDF-based reports\nData Extraction\nExtract data from PDF reports\nParse tables from PDFs\nPull structured data\nAutomate document workflows\nText Processing\nPrepare content for translation\nClean up OCR output\nExtract specific sections\nSearch within PDF content\nPerformance\nText-Based PDFs\nSpeed: ~100ms for 10-page PDF\nAccuracy: 100% (exact text)\nMemory: ~10MB for typical document\nOCR Processing\nSpeed: ~1-3s per page (high quality)\nAccuracy: 85-95% (depends on scan quality)\nMemory: ~50-100MB peak during OCR\nTechnical Details\nPDF Parsing\nUses native PDF.js library\nExtracts text layer directly (no OCR needed)\nPreserves document structure\nHandles password-protected PDFs\nOCR Engine\nTesseract.js under the hood\nSupports 100+ languages\nAdjustable quality/speed tradeoff\nConfidence scoring for accuracy\nDependencies\nZERO external dependencies\nUses Node.js built-in modules only\nPDF.js included in skill\nTesseract.js bundled\nError Handling\nInvalid PDF\nClear error message\nSuggest fix (check file format)\nSkip to next file in batch\nOCR Failure\nReport confidence score\nSuggest rescan at higher quality\nFallback to basic extraction\nMemory Issues\nStream processing for large files\nProgress reporting\nGraceful degradation\nConfiguration\nEdit config.json:\n{\n  \"ocr\": {\n    \"enabled\": true,\n    \"defaultLanguage\": \"eng\",\n    \"quality\": \"medium\",\n    \"languages\": [\"eng\", \"spa\", \"fra\", \"deu\"]\n  },\n  \"output\": {\n    \"defaultFormat\": \"text\",\n    \"preserveFormatting\": true,\n    \"includeMetadata\": true\n  },\n  \"batch\": {\n    \"maxConcurrent\": 3,\n    \"timeoutSeconds\": 30\n  }\n}\n\nExamples\nExtract from Invoice\nconst invoice = await extractText('./invoice.pdf');\nconsole.log(invoice.text);\n// \"INVOICE #12345 Date: 2026-02-04...\"\n\nExtract from Scanned Contract\nconst contract = await extractText('./scanned-contract.pdf', {\n  ocr: true,\n  language: 'eng',\n  ocrQuality: 'high'\n});\nconsole.log(contract.text);\n// \"AGREEMENT This contract between...\"\n\nBatch Process Documents\nconst docs = await extractBatch([\n  './doc1.pdf',\n  './doc2.pdf',\n  './doc3.pdf',\n  './doc4.pdf'\n]);\nconsole.log(`Processed ${docs.successCount}/${docs.results.length} documents`);\n\nTroubleshooting\nOCR Not Working\nCheck if PDF is truly scanned (not text-based)\nTry different quality settings (low/medium/high)\nEnsure language matches document\nCheck image quality of scan\nExtraction Returns Empty\nPDF may be image-only\nOCR failed with low confidence\nTry different language setting\nSlow Processing\nLarge PDF takes longer\nReduce quality for speed\nProcess in smaller batches\nTips\nBest Results\nUse text-based PDFs when possible (faster, 100% accurate)\nHigh-quality scans for OCR (300 DPI+)\nClean background before scanning\nUse correct language setting\nPerformance Optimization\nBatch processing for multiple files\nDisable OCR for text-based PDFs\nLower OCR quality for speed when acceptable\nRoadmap\n PDF/A support\n Advanced OCR pre-processing\n Table extraction from OCR\n Handwriting OCR\n PDF form field extraction\n Batch language detection\n Confidence scoring visualization\nLicense\n\nMIT\n\nExtract text from PDFs. Fast, accurate, zero dependencies. 🔮"
  },
  "trust": {
    "sourceLabel": "tencent",
    "provenanceUrl": "https://clawhub.ai/Michael-laffin/pdf-text-extractor",
    "publisherUrl": "https://clawhub.ai/Michael-laffin/pdf-text-extractor",
    "owner": "Michael-laffin",
    "version": "1.0.0",
    "license": null,
    "verificationStatus": "Indexed source record"
  },
  "links": {
    "detailUrl": "https://openagent3.xyz/skills/pdf-text-extractor",
    "downloadUrl": "https://openagent3.xyz/downloads/pdf-text-extractor",
    "agentUrl": "https://openagent3.xyz/skills/pdf-text-extractor/agent",
    "manifestUrl": "https://openagent3.xyz/skills/pdf-text-extractor/agent.json",
    "briefUrl": "https://openagent3.xyz/skills/pdf-text-extractor/agent.md"
  }
}