{
  "schemaVersion": "1.0",
  "item": {
    "slug": "data-scraper",
    "name": "Data Scraper",
    "source": "tencent",
    "type": "skill",
    "category": "开发工具",
    "sourceUrl": "https://clawhub.ai/mupengi-bot/data-scraper",
    "canonicalUrl": "https://clawhub.ai/mupengi-bot/data-scraper",
    "targetPlatform": "OpenClaw"
  },
  "install": {
    "downloadMode": "redirect",
    "downloadUrl": "/downloads/data-scraper",
    "sourceDownloadUrl": "https://wry-manatee-359.convex.site/api/v1/download?slug=data-scraper",
    "sourcePlatform": "tencent",
    "targetPlatform": "OpenClaw",
    "installMethod": "Manual import",
    "extraction": "Extract archive",
    "prerequisites": [
      "OpenClaw"
    ],
    "packageFormat": "ZIP package",
    "includedAssets": [
      "GUIDE.md",
      "SKILL.md",
      "run.sh"
    ],
    "primaryDoc": "SKILL.md",
    "quickSetup": [
      "Download the package from Yavira.",
      "Extract the archive and review SKILL.md first.",
      "Import or place the package into your OpenClaw setup."
    ],
    "agentAssist": {
      "summary": "Hand the extracted package to your coding agent with a concrete install brief instead of figuring it out manually.",
      "steps": [
        "Download the package from Yavira.",
        "Extract it into a folder your agent can access.",
        "Paste one of the prompts below and point your agent at the extracted folder."
      ],
      "prompts": [
        {
          "label": "New install",
          "body": "I downloaded a skill package from Yavira. Read SKILL.md from the extracted folder and install it by following the included instructions. Tell me what you changed and call out any manual steps you could not complete."
        },
        {
          "label": "Upgrade existing",
          "body": "I downloaded an updated skill package from Yavira. Read SKILL.md from the extracted folder, compare it with my current installation, and upgrade it while preserving any custom configuration unless the package docs explicitly say otherwise. Summarize what changed and any follow-up checks I should run."
        }
      ]
    },
    "sourceHealth": {
      "source": "tencent",
      "slug": "data-scraper",
      "status": "healthy",
      "reason": "direct_download_ok",
      "recommendedAction": "download",
      "checkedAt": "2026-05-02T08:19:09.095Z",
      "expiresAt": "2026-05-09T08:19:09.095Z",
      "httpStatus": 200,
      "finalUrl": "https://wry-manatee-359.convex.site/api/v1/download?slug=data-scraper",
      "contentType": "application/zip",
      "probeMethod": "head",
      "details": {
        "probeUrl": "https://wry-manatee-359.convex.site/api/v1/download?slug=data-scraper",
        "contentDisposition": "attachment; filename=\"data-scraper-1.0.0.zip\"",
        "redirectLocation": null,
        "bodySnippet": null,
        "slug": "data-scraper"
      },
      "scope": "item",
      "summary": "Item download looks usable.",
      "detail": "Yavira can redirect you to the upstream package for this item.",
      "primaryActionLabel": "Download for OpenClaw",
      "primaryActionHref": "/downloads/data-scraper"
    },
    "validation": {
      "installChecklist": [
        "Use the Yavira download entry.",
        "Review SKILL.md after the package is downloaded.",
        "Confirm the extracted package contains the expected setup assets."
      ],
      "postInstallChecks": [
        "Confirm the extracted package includes the expected docs or setup files.",
        "Validate the skill or prompts are available in your target agent workspace.",
        "Capture any manual follow-up steps the agent could not complete."
      ]
    },
    "downloadPageUrl": "https://openagent3.xyz/downloads/data-scraper",
    "agentPageUrl": "https://openagent3.xyz/skills/data-scraper/agent",
    "manifestUrl": "https://openagent3.xyz/skills/data-scraper/agent.json",
    "briefUrl": "https://openagent3.xyz/skills/data-scraper/agent.md"
  },
  "agentAssist": {
    "summary": "Hand the extracted package to your coding agent with a concrete install brief instead of figuring it out manually.",
    "steps": [
      "Download the package from Yavira.",
      "Extract it into a folder your agent can access.",
      "Paste one of the prompts below and point your agent at the extracted folder."
    ],
    "prompts": [
      {
        "label": "New install",
        "body": "I downloaded a skill package from Yavira. Read SKILL.md from the extracted folder and install it by following the included instructions. Tell me what you changed and call out any manual steps you could not complete."
      },
      {
        "label": "Upgrade existing",
        "body": "I downloaded an updated skill package from Yavira. Read SKILL.md from the extracted folder, compare it with my current installation, and upgrade it while preserving any custom configuration unless the package docs explicitly say otherwise. Summarize what changed and any follow-up checks I should run."
      }
    ]
  },
  "documentation": {
    "source": "clawhub",
    "primaryDoc": "SKILL.md",
    "sections": [
      {
        "title": "data-scraper",
        "body": "Web Data Scraper — Extract structured data from web pages using curl + parsing. Lightweight, no browser required. Supports HTML-to-text, table extraction, price monitoring, and batch scraping."
      },
      {
        "title": "When to Use",
        "body": "Extract text content from web pages (articles, blogs, docs)\nScrape product prices, reviews, or listings\nMonitor pages for changes (price drops, new content)\nBatch-collect data from multiple URLs\nConvert HTML tables to structured formats (JSON/CSV)"
      },
      {
        "title": "Quick Start",
        "body": "# Extract readable text from URL\ndata-scraper fetch \"https://example.com/article\"\n\n# Extract specific elements\ndata-scraper extract \"https://example.com\" --selector \"h2, .price\"\n\n# Monitor for changes\ndata-scraper watch \"https://example.com/product\" --interval 3600"
      },
      {
        "title": "Text Mode (default)",
        "body": "Fetches page and extracts readable content, stripping HTML tags, scripts, and styles. Similar to reader mode.\n\ndata-scraper fetch URL\n# Output: clean markdown text"
      },
      {
        "title": "Selector Mode",
        "body": "Target specific CSS selectors for precise extraction.\n\ndata-scraper extract URL --selector \".product-title, .price, .rating\"\n# Output: matched elements as structured data"
      },
      {
        "title": "Table Mode",
        "body": "Extract HTML tables into structured formats.\n\ndata-scraper table URL --index 0\n# Output: JSON array of row objects (header → value mapping)"
      },
      {
        "title": "Link Mode",
        "body": "Extract all links from a page with optional filtering.\n\ndata-scraper links URL --filter \"*.pdf\"\n# Output: filtered list of absolute URLs"
      },
      {
        "title": "Batch Scraping",
        "body": "# Scrape multiple URLs\ndata-scraper batch urls.txt --output results/\n\n# With rate limiting\ndata-scraper batch urls.txt --delay 2000 --output results/\n\nurls.txt format:\n\nhttps://site1.com/page\nhttps://site2.com/page\nhttps://site3.com/page"
      },
      {
        "title": "Change Monitoring",
        "body": "# Watch for changes, alert on diff\ndata-scraper watch URL --selector \".price\" --interval 3600\n\n# Compare with previous snapshot\ndata-scraper diff URL\n\nStores snapshots in data-scraper/snapshots/ with timestamps. Alerts via notification-hub when changes detected."
      },
      {
        "title": "Output Formats",
        "body": "FormatFlagUse CaseText--format textReading, summarizationJSON--format jsonData processingCSV--format csvSpreadsheetsMarkdown--format mdDocumentation"
      },
      {
        "title": "Headers & Auth",
        "body": "# Custom headers\ndata-scraper fetch URL --header \"Authorization: Bearer TOKEN\"\n\n# Cookie-based auth\ndata-scraper fetch URL --cookie \"session=abc123\"\n\n# User-Agent override\ndata-scraper fetch URL --ua \"Mozilla/5.0...\""
      },
      {
        "title": "Rate Limiting & Ethics",
        "body": "Default: 1 request per second per domain\nRespects robots.txt when --polite flag is set\nConfigurable delay between requests\nStops on 429 (Too Many Requests) and backs off"
      },
      {
        "title": "Error Handling",
        "body": "ErrorBehavior404Log and skip403/401Warn about auth requirement429Exponential backoff (max 3 retries)TimeoutRetry once with longer timeoutSSL errorWarn, option to proceed with --insecure"
      },
      {
        "title": "Integration",
        "body": "web-claude: Use as fallback when web_fetch isn't enough\ncompetitor-watch: Feed scraped data into competitor analysis\nseo-audit: Scrape competitor pages for SEO comparison\nperformance-tracker: Collect social metrics from public profiles"
      }
    ],
    "body": "data-scraper\n\nWeb Data Scraper — Extract structured data from web pages using curl + parsing. Lightweight, no browser required. Supports HTML-to-text, table extraction, price monitoring, and batch scraping.\n\nWhen to Use\nExtract text content from web pages (articles, blogs, docs)\nScrape product prices, reviews, or listings\nMonitor pages for changes (price drops, new content)\nBatch-collect data from multiple URLs\nConvert HTML tables to structured formats (JSON/CSV)\nQuick Start\n# Extract readable text from URL\ndata-scraper fetch \"https://example.com/article\"\n\n# Extract specific elements\ndata-scraper extract \"https://example.com\" --selector \"h2, .price\"\n\n# Monitor for changes\ndata-scraper watch \"https://example.com/product\" --interval 3600\n\nExtraction Modes\nText Mode (default)\n\nFetches page and extracts readable content, stripping HTML tags, scripts, and styles. Similar to reader mode.\n\ndata-scraper fetch URL\n# Output: clean markdown text\n\nSelector Mode\n\nTarget specific CSS selectors for precise extraction.\n\ndata-scraper extract URL --selector \".product-title, .price, .rating\"\n# Output: matched elements as structured data\n\nTable Mode\n\nExtract HTML tables into structured formats.\n\ndata-scraper table URL --index 0\n# Output: JSON array of row objects (header → value mapping)\n\nLink Mode\n\nExtract all links from a page with optional filtering.\n\ndata-scraper links URL --filter \"*.pdf\"\n# Output: filtered list of absolute URLs\n\nBatch Scraping\n# Scrape multiple URLs\ndata-scraper batch urls.txt --output results/\n\n# With rate limiting\ndata-scraper batch urls.txt --delay 2000 --output results/\n\n\nurls.txt format:\n\nhttps://site1.com/page\nhttps://site2.com/page\nhttps://site3.com/page\n\nChange Monitoring\n# Watch for changes, alert on diff\ndata-scraper watch URL --selector \".price\" --interval 3600\n\n# Compare with previous snapshot\ndata-scraper diff URL\n\n\nStores snapshots in data-scraper/snapshots/ with timestamps. Alerts via notification-hub when changes detected.\n\nOutput Formats\nFormat\tFlag\tUse Case\nText\t--format text\tReading, summarization\nJSON\t--format json\tData processing\nCSV\t--format csv\tSpreadsheets\nMarkdown\t--format md\tDocumentation\nHeaders & Auth\n# Custom headers\ndata-scraper fetch URL --header \"Authorization: Bearer TOKEN\"\n\n# Cookie-based auth\ndata-scraper fetch URL --cookie \"session=abc123\"\n\n# User-Agent override\ndata-scraper fetch URL --ua \"Mozilla/5.0...\"\n\nRate Limiting & Ethics\nDefault: 1 request per second per domain\nRespects robots.txt when --polite flag is set\nConfigurable delay between requests\nStops on 429 (Too Many Requests) and backs off\nError Handling\nError\tBehavior\n404\tLog and skip\n403/401\tWarn about auth requirement\n429\tExponential backoff (max 3 retries)\nTimeout\tRetry once with longer timeout\nSSL error\tWarn, option to proceed with --insecure\nIntegration\nweb-claude: Use as fallback when web_fetch isn't enough\ncompetitor-watch: Feed scraped data into competitor analysis\nseo-audit: Scrape competitor pages for SEO comparison\nperformance-tracker: Collect social metrics from public profiles"
  },
  "trust": {
    "sourceLabel": "tencent",
    "provenanceUrl": "https://clawhub.ai/mupengi-bot/data-scraper",
    "publisherUrl": "https://clawhub.ai/mupengi-bot/data-scraper",
    "owner": "mupengi-bot",
    "version": "1.0.0",
    "license": null,
    "verificationStatus": "Indexed source record"
  },
  "links": {
    "detailUrl": "https://openagent3.xyz/skills/data-scraper",
    "downloadUrl": "https://openagent3.xyz/downloads/data-scraper",
    "agentUrl": "https://openagent3.xyz/skills/data-scraper/agent",
    "manifestUrl": "https://openagent3.xyz/skills/data-scraper/agent.json",
    "briefUrl": "https://openagent3.xyz/skills/data-scraper/agent.md"
  }
}