{
  "schemaVersion": "1.0",
  "item": {
    "slug": "pdf-parser-mineru",
    "name": "pdf-parser-mineru",
    "source": "tencent",
    "type": "skill",
    "category": "开发工具",
    "sourceUrl": "https://clawhub.ai/baokui/pdf-parser-mineru",
    "canonicalUrl": "https://clawhub.ai/baokui/pdf-parser-mineru",
    "targetPlatform": "OpenClaw"
  },
  "install": {
    "downloadMode": "redirect",
    "downloadUrl": "/downloads/pdf-parser-mineru",
    "sourceDownloadUrl": "https://wry-manatee-359.convex.site/api/v1/download?slug=pdf-parser-mineru",
    "sourcePlatform": "tencent",
    "targetPlatform": "OpenClaw",
    "installMethod": "Manual import",
    "extraction": "Extract archive",
    "prerequisites": [
      "OpenClaw"
    ],
    "packageFormat": "ZIP package",
    "includedAssets": [
      "install.sh",
      "SKILL.md",
      "SKILL_zh.md",
      "script/pdf_parser.py"
    ],
    "primaryDoc": "SKILL.md",
    "quickSetup": [
      "Download the package from Yavira.",
      "Extract the archive and review SKILL.md first.",
      "Import or place the package into your OpenClaw setup."
    ],
    "agentAssist": {
      "summary": "Hand the extracted package to your coding agent with a concrete install brief instead of figuring it out manually.",
      "steps": [
        "Download the package from Yavira.",
        "Extract it into a folder your agent can access.",
        "Paste one of the prompts below and point your agent at the extracted folder."
      ],
      "prompts": [
        {
          "label": "New install",
          "body": "I downloaded a skill package from Yavira. Read SKILL.md from the extracted folder and install it by following the included instructions. Tell me what you changed and call out any manual steps you could not complete."
        },
        {
          "label": "Upgrade existing",
          "body": "I downloaded an updated skill package from Yavira. Read SKILL.md from the extracted folder, compare it with my current installation, and upgrade it while preserving any custom configuration unless the package docs explicitly say otherwise. Summarize what changed and any follow-up checks I should run."
        }
      ]
    },
    "sourceHealth": {
      "source": "tencent",
      "status": "healthy",
      "reason": "direct_download_ok",
      "recommendedAction": "download",
      "checkedAt": "2026-05-07T17:22:31.273Z",
      "expiresAt": "2026-05-14T17:22:31.273Z",
      "httpStatus": 200,
      "finalUrl": "https://wry-manatee-359.convex.site/api/v1/download?slug=afrexai-annual-report",
      "contentType": "application/zip",
      "probeMethod": "head",
      "details": {
        "probeUrl": "https://wry-manatee-359.convex.site/api/v1/download?slug=afrexai-annual-report",
        "contentDisposition": "attachment; filename=\"afrexai-annual-report-1.0.0.zip\"",
        "redirectLocation": null,
        "bodySnippet": null
      },
      "scope": "source",
      "summary": "Source download looks usable.",
      "detail": "Yavira can redirect you to the upstream package for this source.",
      "primaryActionLabel": "Download for OpenClaw",
      "primaryActionHref": "/downloads/pdf-parser-mineru"
    },
    "validation": {
      "installChecklist": [
        "Use the Yavira download entry.",
        "Review SKILL.md after the package is downloaded.",
        "Confirm the extracted package contains the expected setup assets."
      ],
      "postInstallChecks": [
        "Confirm the extracted package includes the expected docs or setup files.",
        "Validate the skill or prompts are available in your target agent workspace.",
        "Capture any manual follow-up steps the agent could not complete."
      ]
    },
    "downloadPageUrl": "https://openagent3.xyz/downloads/pdf-parser-mineru",
    "agentPageUrl": "https://openagent3.xyz/skills/pdf-parser-mineru/agent",
    "manifestUrl": "https://openagent3.xyz/skills/pdf-parser-mineru/agent.json",
    "briefUrl": "https://openagent3.xyz/skills/pdf-parser-mineru/agent.md"
  },
  "agentAssist": {
    "summary": "Hand the extracted package to your coding agent with a concrete install brief instead of figuring it out manually.",
    "steps": [
      "Download the package from Yavira.",
      "Extract it into a folder your agent can access.",
      "Paste one of the prompts below and point your agent at the extracted folder."
    ],
    "prompts": [
      {
        "label": "New install",
        "body": "I downloaded a skill package from Yavira. Read SKILL.md from the extracted folder and install it by following the included instructions. Tell me what you changed and call out any manual steps you could not complete."
      },
      {
        "label": "Upgrade existing",
        "body": "I downloaded an updated skill package from Yavira. Read SKILL.md from the extracted folder, compare it with my current installation, and upgrade it while preserving any custom configuration unless the package docs explicitly say otherwise. Summarize what changed and any follow-up checks I should run."
      }
    ]
  },
  "documentation": {
    "source": "clawhub",
    "primaryDoc": "SKILL.md",
    "sections": [
      {
        "title": "1. pdf_to_markdown",
        "body": "Convert PDF documents to Markdown format, preserving document structure, formulas, tables, and images.\n\nDescription: Use MinerU to parse PDF documents and output in Markdown format, supporting OCR, formula recognition, table extraction, and other features.\n\nParameters:\n\nfile_path (string, required): Absolute path to the PDF file\noutput_dir (string, required): Absolute path to the output directory\nbackend (string, optional): Parsing backend, options: hybrid-auto-engine (default), pipeline, vlm-auto-engine\nlanguage (string, optional): OCR language code, such as en (English), ch (Chinese), ja (Japanese), etc., defaults to auto-detection\nenable_formula (boolean, optional): Whether to enable formula recognition, defaults to true\nenable_table (boolean, optional): Whether to enable table extraction, defaults to true\nstart_page (integer, optional): Start page number (starting from 0), defaults to 0\nend_page (integer, optional): End page number (starting from 0), defaults to -1 meaning parse all pages\n\nReturn Value:\n\n{\n  \"success\": true,\n  \"output_path\": \"/path/to/output\",\n  \"markdown_content\": \"Converted Markdown content...\",\n  \"images\": [\"List of image paths\"],\n  \"tables\": [\"List of table information\"],\n  \"formula_count\": 10\n}\n\nExamples:\n\npython .claude/skills/pdf-process/script/pdf_parser.py \\\n  '{\"name\": \"pdf_to_markdown\", \"arguments\": {\"file_path\": \"/path/to/document.pdf\", \"output_dir\": \"/path/to/output\"}}'\n\n# Use specific backend\npython .claude/skills/pdf-process/script/pdf_parser.py \\\n  '{\"name\": \"pdf_to_markdown\", \"arguments\": {\"file_path\": \"/path/to/document.pdf\", \"output_dir\": \"/path/to/output\", \"backend\": \"pipeline\"}}'\n\n# Parse specific pages\npython .claude/skills/pdf-process/script/pdf_parser.py \\\n  '{\"name\": \"pdf_to_markdown\", \"arguments\": {\"file_path\": \"/path/to/document.pdf\", \"output_dir\": \"/path/to/output\", \"start_page\": 0, \"end_page\": 5}}'"
      },
      {
        "title": "2. pdf_to_json",
        "body": "Convert PDF documents to JSON format, including detailed layout and structural information.\n\nDescription: Use MinerU to parse PDF documents and output in JSON format, containing structured information such as text blocks, images, tables, formulas, etc.\n\nParameters:\n\nfile_path (string, required): Absolute path to the PDF file\noutput_dir (string, required): Absolute path to the output directory\nbackend (string, optional): Parsing backend, options: hybrid-auto-engine (default), pipeline, vlm-auto-engine\nlanguage (string, optional): OCR language code, such as en (English), ch (Chinese), ja (Japanese), etc., defaults to auto-detection\nenable_formula (boolean, optional): Whether to enable formula recognition, defaults to true\nenable_table (boolean, optional): Whether to enable table extraction, defaults to true\nstart_page (integer, optional): Start page number (starting from 0), defaults to 0\nend_page (integer, optional): End page number (starting from 0), defaults to -1 meaning parse all pages\n\nReturn Value:\n\n{\n  \"success\": true,\n  \"output_path\": \"/path/to/output.json\",\n  \"pages\": [\n    {\n      \"page_no\": 0,\n      \"page_size\": [595, 842],\n      \"blocks\": [\n        {\n          \"type\": \"text\",\n          \"text\": \"Text content\",\n          \"bbox\": [x, y, x, y]\n        }\n      ],\n      \"images\": [],\n      \"tables\": [],\n      \"formulas\": []\n    }\n  ],\n  \"metadata\": {\n    \"total_pages\": 10,\n    \"author\": \"Author\",\n    \"title\": \"Title\"\n  }\n}\n\nExamples:\n\npython .claude/skills/pdf-process/script/pdf_parser.py \\\n  '{\"name\": \"pdf_to_json\", \"arguments\": {\"file_path\": \"/path/to/document.pdf\", \"output_dir\": \"/path/to/output\"}}'\n\n# Use specific backend and language\npython .claude/skills/pdf-process/script/pdf_parser.py \\\n  '{\"name\": \"pdf_to_json\", \"arguments\": {\"file_path\": \"/path/to/document.pdf\", \"output_dir\": \"/path/to/output\", \"backend\": \"hybrid-auto-engine\", \"language\": \"ch\"}}'"
      },
      {
        "title": "1. Install MinerU",
        "body": "# Update pip and install uv\npip install --upgrade pip\npip install uv\n\n# Install MinerU (including all features)\nuv pip install -U \"mineru[all]\""
      },
      {
        "title": "2. Verify Installation",
        "body": "# Check if MinerU is installed successfully\nmineru --version\n\n# Test basic functionality\nmineru --help"
      },
      {
        "title": "3. System Requirements",
        "body": "Python Version: 3.10-3.13\nOperating System: Linux / Windows / macOS 14.0+\nMemory:\n\nUsing pipeline backend: minimum 16GB, recommended 32GB+\nUsing hybrid/vlm backend: minimum 16GB, recommended 32GB+\n\n\nDisk Space: minimum 20GB (SSD recommended)\nGPU (optional):\n\npipeline backend: supports CPU-only\nhybrid/vlm backend: requires NVIDIA GPU (Volta architecture and above) or Apple Silicon"
      },
      {
        "title": "Use Cases",
        "body": "Academic Paper Parsing: Extract structured content such as formulas, tables, and images\nTechnical Document Conversion: Convert PDF documents to Markdown for version control and online publishing\nOCR Processing: Process scanned PDFs and garbled PDFs\nMultilingual Documents: Supports OCR recognition for 109 languages\nBatch Processing: Batch convert multiple PDF documents"
      },
      {
        "title": "Backend Selection Recommendations",
        "body": "hybrid-auto-engine (default): Balanced accuracy and speed, suitable for most scenarios\npipeline: Suitable for CPU-only environments, best compatibility\nvlm-auto-engine: Highest accuracy, requires GPU acceleration"
      },
      {
        "title": "Notes",
        "body": "File Paths: All paths must be absolute paths\nOutput Directory: Non-existent directories will be created automatically\nPerformance: Using GPU can significantly improve parsing speed\nPage Numbers: Page numbers start counting from 0\nMemory: Processing large documents may consume more memory"
      },
      {
        "title": "Common Issues",
        "body": "Installation Failure:\n\nEnsure using Python 3.10-3.13\nWindows only supports Python 3.10-3.12 (ray does not support 3.13)\nUsing uv pip install can resolve most dependency conflicts\n\n\n\nInsufficient Memory:\n\nUse pipeline backend\nLimit parsing pages: start_page and end_page\nReduce virtual memory allocation\n\n\n\nSlow Parsing Speed:\n\nEnable GPU acceleration\nUse hybrid-auto-engine backend\nDisable unnecessary features (formulas, tables)\n\n\n\nLow OCR Accuracy:\n\nSpecify the correct document language\nEnsure the backend supports OCR (use pipeline or hybrid-*)"
      },
      {
        "title": "Related Resources",
        "body": "MinerU Official Documentation: https://opendatalab.github.io/MinerU/\nMinerU GitHub: https://github.com/opendatalab/MinerU\nOnline Demo: https://mineru.net/"
      }
    ],
    "body": "Tool List\n1. pdf_to_markdown\n\nConvert PDF documents to Markdown format, preserving document structure, formulas, tables, and images.\n\nDescription: Use MinerU to parse PDF documents and output in Markdown format, supporting OCR, formula recognition, table extraction, and other features.\n\nParameters:\n\nfile_path (string, required): Absolute path to the PDF file\noutput_dir (string, required): Absolute path to the output directory\nbackend (string, optional): Parsing backend, options: hybrid-auto-engine (default), pipeline, vlm-auto-engine\nlanguage (string, optional): OCR language code, such as en (English), ch (Chinese), ja (Japanese), etc., defaults to auto-detection\nenable_formula (boolean, optional): Whether to enable formula recognition, defaults to true\nenable_table (boolean, optional): Whether to enable table extraction, defaults to true\nstart_page (integer, optional): Start page number (starting from 0), defaults to 0\nend_page (integer, optional): End page number (starting from 0), defaults to -1 meaning parse all pages\n\nReturn Value:\n\n{\n  \"success\": true,\n  \"output_path\": \"/path/to/output\",\n  \"markdown_content\": \"Converted Markdown content...\",\n  \"images\": [\"List of image paths\"],\n  \"tables\": [\"List of table information\"],\n  \"formula_count\": 10\n}\n\n\nExamples:\n\npython .claude/skills/pdf-process/script/pdf_parser.py \\\n  '{\"name\": \"pdf_to_markdown\", \"arguments\": {\"file_path\": \"/path/to/document.pdf\", \"output_dir\": \"/path/to/output\"}}'\n\n# Use specific backend\npython .claude/skills/pdf-process/script/pdf_parser.py \\\n  '{\"name\": \"pdf_to_markdown\", \"arguments\": {\"file_path\": \"/path/to/document.pdf\", \"output_dir\": \"/path/to/output\", \"backend\": \"pipeline\"}}'\n\n# Parse specific pages\npython .claude/skills/pdf-process/script/pdf_parser.py \\\n  '{\"name\": \"pdf_to_markdown\", \"arguments\": {\"file_path\": \"/path/to/document.pdf\", \"output_dir\": \"/path/to/output\", \"start_page\": 0, \"end_page\": 5}}'\n\n2. pdf_to_json\n\nConvert PDF documents to JSON format, including detailed layout and structural information.\n\nDescription: Use MinerU to parse PDF documents and output in JSON format, containing structured information such as text blocks, images, tables, formulas, etc.\n\nParameters:\n\nfile_path (string, required): Absolute path to the PDF file\noutput_dir (string, required): Absolute path to the output directory\nbackend (string, optional): Parsing backend, options: hybrid-auto-engine (default), pipeline, vlm-auto-engine\nlanguage (string, optional): OCR language code, such as en (English), ch (Chinese), ja (Japanese), etc., defaults to auto-detection\nenable_formula (boolean, optional): Whether to enable formula recognition, defaults to true\nenable_table (boolean, optional): Whether to enable table extraction, defaults to true\nstart_page (integer, optional): Start page number (starting from 0), defaults to 0\nend_page (integer, optional): End page number (starting from 0), defaults to -1 meaning parse all pages\n\nReturn Value:\n\n{\n  \"success\": true,\n  \"output_path\": \"/path/to/output.json\",\n  \"pages\": [\n    {\n      \"page_no\": 0,\n      \"page_size\": [595, 842],\n      \"blocks\": [\n        {\n          \"type\": \"text\",\n          \"text\": \"Text content\",\n          \"bbox\": [x, y, x, y]\n        }\n      ],\n      \"images\": [],\n      \"tables\": [],\n      \"formulas\": []\n    }\n  ],\n  \"metadata\": {\n    \"total_pages\": 10,\n    \"author\": \"Author\",\n    \"title\": \"Title\"\n  }\n}\n\n\nExamples:\n\npython .claude/skills/pdf-process/script/pdf_parser.py \\\n  '{\"name\": \"pdf_to_json\", \"arguments\": {\"file_path\": \"/path/to/document.pdf\", \"output_dir\": \"/path/to/output\"}}'\n\n# Use specific backend and language\npython .claude/skills/pdf-process/script/pdf_parser.py \\\n  '{\"name\": \"pdf_to_json\", \"arguments\": {\"file_path\": \"/path/to/document.pdf\", \"output_dir\": \"/path/to/output\", \"backend\": \"hybrid-auto-engine\", \"language\": \"ch\"}}'\n\nInstallation Instructions\n1. Install MinerU\n# Update pip and install uv\npip install --upgrade pip\npip install uv\n\n# Install MinerU (including all features)\nuv pip install -U \"mineru[all]\"\n\n2. Verify Installation\n# Check if MinerU is installed successfully\nmineru --version\n\n# Test basic functionality\nmineru --help\n\n3. System Requirements\nPython Version: 3.10-3.13\nOperating System: Linux / Windows / macOS 14.0+\nMemory:\nUsing pipeline backend: minimum 16GB, recommended 32GB+\nUsing hybrid/vlm backend: minimum 16GB, recommended 32GB+\nDisk Space: minimum 20GB (SSD recommended)\nGPU (optional):\npipeline backend: supports CPU-only\nhybrid/vlm backend: requires NVIDIA GPU (Volta architecture and above) or Apple Silicon\nUse Cases\nAcademic Paper Parsing: Extract structured content such as formulas, tables, and images\nTechnical Document Conversion: Convert PDF documents to Markdown for version control and online publishing\nOCR Processing: Process scanned PDFs and garbled PDFs\nMultilingual Documents: Supports OCR recognition for 109 languages\nBatch Processing: Batch convert multiple PDF documents\nBackend Selection Recommendations\nhybrid-auto-engine (default): Balanced accuracy and speed, suitable for most scenarios\npipeline: Suitable for CPU-only environments, best compatibility\nvlm-auto-engine: Highest accuracy, requires GPU acceleration\nNotes\nFile Paths: All paths must be absolute paths\nOutput Directory: Non-existent directories will be created automatically\nPerformance: Using GPU can significantly improve parsing speed\nPage Numbers: Page numbers start counting from 0\nMemory: Processing large documents may consume more memory\nTroubleshooting\nCommon Issues\n\nInstallation Failure:\n\nEnsure using Python 3.10-3.13\nWindows only supports Python 3.10-3.12 (ray does not support 3.13)\nUsing uv pip install can resolve most dependency conflicts\n\nInsufficient Memory:\n\nUse pipeline backend\nLimit parsing pages: start_page and end_page\nReduce virtual memory allocation\n\nSlow Parsing Speed:\n\nEnable GPU acceleration\nUse hybrid-auto-engine backend\nDisable unnecessary features (formulas, tables)\n\nLow OCR Accuracy:\n\nSpecify the correct document language\nEnsure the backend supports OCR (use pipeline or hybrid-*)\nRelated Resources\nMinerU Official Documentation: https://opendatalab.github.io/MinerU/\nMinerU GitHub: https://github.com/opendatalab/MinerU\nOnline Demo: https://mineru.net/"
  },
  "trust": {
    "sourceLabel": "tencent",
    "provenanceUrl": "https://clawhub.ai/baokui/pdf-parser-mineru",
    "publisherUrl": "https://clawhub.ai/baokui/pdf-parser-mineru",
    "owner": "baokui",
    "version": "1.0.2",
    "license": null,
    "verificationStatus": "Indexed source record"
  },
  "links": {
    "detailUrl": "https://openagent3.xyz/skills/pdf-parser-mineru",
    "downloadUrl": "https://openagent3.xyz/downloads/pdf-parser-mineru",
    "agentUrl": "https://openagent3.xyz/skills/pdf-parser-mineru/agent",
    "manifestUrl": "https://openagent3.xyz/skills/pdf-parser-mineru/agent.json",
    "briefUrl": "https://openagent3.xyz/skills/pdf-parser-mineru/agent.md"
  }
}