{
  "schemaVersion": "1.0",
  "item": {
    "slug": "web-scraper-as-a-service",
    "name": "Web Scraper as a Service",
    "source": "tencent",
    "type": "skill",
    "category": "开发工具",
    "sourceUrl": "https://clawhub.ai/seanwyngaard/web-scraper-as-a-service",
    "canonicalUrl": "https://clawhub.ai/seanwyngaard/web-scraper-as-a-service",
    "targetPlatform": "OpenClaw"
  },
  "install": {
    "downloadMode": "redirect",
    "downloadUrl": "/downloads/web-scraper-as-a-service",
    "sourceDownloadUrl": "https://wry-manatee-359.convex.site/api/v1/download?slug=web-scraper-as-a-service",
    "sourcePlatform": "tencent",
    "targetPlatform": "OpenClaw",
    "installMethod": "Manual import",
    "extraction": "Extract archive",
    "prerequisites": [
      "OpenClaw"
    ],
    "packageFormat": "ZIP package",
    "includedAssets": [
      "SKILL.md"
    ],
    "primaryDoc": "SKILL.md",
    "quickSetup": [
      "Download the package from Yavira.",
      "Extract the archive and review SKILL.md first.",
      "Import or place the package into your OpenClaw setup."
    ],
    "agentAssist": {
      "summary": "Hand the extracted package to your coding agent with a concrete install brief instead of figuring it out manually.",
      "steps": [
        "Download the package from Yavira.",
        "Extract it into a folder your agent can access.",
        "Paste one of the prompts below and point your agent at the extracted folder."
      ],
      "prompts": [
        {
          "label": "New install",
          "body": "I downloaded a skill package from Yavira. Read SKILL.md from the extracted folder and install it by following the included instructions. Tell me what you changed and call out any manual steps you could not complete."
        },
        {
          "label": "Upgrade existing",
          "body": "I downloaded an updated skill package from Yavira. Read SKILL.md from the extracted folder, compare it with my current installation, and upgrade it while preserving any custom configuration unless the package docs explicitly say otherwise. Summarize what changed and any follow-up checks I should run."
        }
      ]
    },
    "sourceHealth": {
      "source": "tencent",
      "status": "healthy",
      "reason": "direct_download_ok",
      "recommendedAction": "download",
      "checkedAt": "2026-05-07T17:22:31.273Z",
      "expiresAt": "2026-05-14T17:22:31.273Z",
      "httpStatus": 200,
      "finalUrl": "https://wry-manatee-359.convex.site/api/v1/download?slug=afrexai-annual-report",
      "contentType": "application/zip",
      "probeMethod": "head",
      "details": {
        "probeUrl": "https://wry-manatee-359.convex.site/api/v1/download?slug=afrexai-annual-report",
        "contentDisposition": "attachment; filename=\"afrexai-annual-report-1.0.0.zip\"",
        "redirectLocation": null,
        "bodySnippet": null
      },
      "scope": "source",
      "summary": "Source download looks usable.",
      "detail": "Yavira can redirect you to the upstream package for this source.",
      "primaryActionLabel": "Download for OpenClaw",
      "primaryActionHref": "/downloads/web-scraper-as-a-service"
    },
    "validation": {
      "installChecklist": [
        "Use the Yavira download entry.",
        "Review SKILL.md after the package is downloaded.",
        "Confirm the extracted package contains the expected setup assets."
      ],
      "postInstallChecks": [
        "Confirm the extracted package includes the expected docs or setup files.",
        "Validate the skill or prompts are available in your target agent workspace.",
        "Capture any manual follow-up steps the agent could not complete."
      ]
    },
    "downloadPageUrl": "https://openagent3.xyz/downloads/web-scraper-as-a-service",
    "agentPageUrl": "https://openagent3.xyz/skills/web-scraper-as-a-service/agent",
    "manifestUrl": "https://openagent3.xyz/skills/web-scraper-as-a-service/agent.json",
    "briefUrl": "https://openagent3.xyz/skills/web-scraper-as-a-service/agent.md"
  },
  "agentAssist": {
    "summary": "Hand the extracted package to your coding agent with a concrete install brief instead of figuring it out manually.",
    "steps": [
      "Download the package from Yavira.",
      "Extract it into a folder your agent can access.",
      "Paste one of the prompts below and point your agent at the extracted folder."
    ],
    "prompts": [
      {
        "label": "New install",
        "body": "I downloaded a skill package from Yavira. Read SKILL.md from the extracted folder and install it by following the included instructions. Tell me what you changed and call out any manual steps you could not complete."
      },
      {
        "label": "Upgrade existing",
        "body": "I downloaded an updated skill package from Yavira. Read SKILL.md from the extracted folder, compare it with my current installation, and upgrade it while preserving any custom configuration unless the package docs explicitly say otherwise. Summarize what changed and any follow-up checks I should run."
      }
    ]
  },
  "documentation": {
    "source": "clawhub",
    "primaryDoc": "SKILL.md",
    "sections": [
      {
        "title": "Web Scraper as a Service",
        "body": "Turn scraping briefs into deliverable scraping projects. Generates the scraper, runs it, cleans the data, and packages everything for the client."
      },
      {
        "title": "How to Use",
        "body": "/web-scraper-as-a-service \"Scrape all products from example-store.com — need name, price, description, images. CSV output.\"\n/web-scraper-as-a-service https://example.com --fields \"title,price,rating,url\" --format csv\n/web-scraper-as-a-service brief.txt"
      },
      {
        "title": "Step 1: Analyze the Target",
        "body": "Before writing any code:\n\nFetch the target URL to understand the page structure\nIdentify:\n\nIs the site server-rendered (static HTML) or client-rendered (JavaScript/SPA)?\nWhat anti-scraping measures are visible? (Cloudflare, CAPTCHAs, rate limits)\nPagination pattern (URL params, infinite scroll, load more button)\nData structure (product cards, table rows, list items)\nTotal estimated volume (number of pages/items)\n\n\nChoose the right tool:\n\nStatic HTML → Python + requests + BeautifulSoup\nJavaScript-rendered → Python + playwright\nAPI available → Direct API calls (check network tab patterns)"
      },
      {
        "title": "Step 2: Build the Scraper",
        "body": "Generate a complete Python script in scraper/ directory:\n\nscraper/\n  scrape.py           # Main scraper script\n  requirements.txt    # Dependencies\n  config.json         # Target URLs, fields, settings\n  README.md           # Setup and usage instructions for client\n\nscrape.py must include:\n\n# Required features in every scraper:\n\n# 1. Configuration\nimport json\nconfig = json.load(open('config.json'))\n\n# 2. Rate limiting (ALWAYS — be respectful)\nimport time\nDELAY_BETWEEN_REQUESTS = 2  # seconds, adjustable in config\n\n# 3. Retry logic\nMAX_RETRIES = 3\nRETRY_DELAY = 5\n\n# 4. User-Agent rotation\nUSER_AGENTS = [\n    \"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36...\",\n    # ... at least 5 user agents\n]\n\n# 5. Progress tracking\nprint(f\"Scraping page {current}/{total} — {items_collected} items collected\")\n\n# 6. Error handling\n# - Log errors but don't crash on individual page failures\n# - Save progress incrementally (don't lose data on crash)\n# - Write errors to error_log.txt\n\n# 7. Output\n# - Save data incrementally (append to file, don't hold in memory)\n# - Support CSV and JSON output\n# - Clean and normalize data before saving\n\n# 8. Resume capability\n# - Track last successfully scraped page/URL\n# - Can resume from where it left off if interrupted"
      },
      {
        "title": "Step 3: Data Cleaning",
        "body": "After scraping, clean the data:\n\nRemove duplicates (by unique identifier or composite key)\nNormalize text (strip extra whitespace, fix encoding issues, consistent capitalization)\nValidate data (no empty required fields, prices are numbers, URLs are valid)\nStandardize formats (dates to ISO 8601, currency to numbers, consistent units)\nGenerate data quality report:\nData Quality Report\n───────────────────\nTotal records: 2,487\nDuplicates removed: 13\nEmpty fields filled: 0\nFields with issues: price (3 records had non-numeric values — cleaned)\nCompleteness: 99.5%"
      },
      {
        "title": "Step 4: Client Deliverable Package",
        "body": "Generate a complete deliverable:\n\ndelivery/\n  data.csv                    # Clean data in requested format\n  data.json                   # JSON alternative\n  data-quality-report.md      # Quality metrics\n  scraper-documentation.md    # How the scraper works\n  README.md                   # Quick start guide\n\nscraper-documentation.md includes:\n\nWhat was scraped and from where\nHow many records collected\nData fields and their descriptions\nHow to re-run the scraper\nKnown limitations\nDate of scraping"
      },
      {
        "title": "Step 5: Output to User",
        "body": "Present:\n\nSummary: X records scraped from Y pages, Z% data quality\nSample data: First 5 rows of the output\nFile locations: Where the deliverables are saved\nClient handoff notes: What to tell the client about the data"
      },
      {
        "title": "Scraper Templates",
        "body": "Based on the target type, use the appropriate template:"
      },
      {
        "title": "E-commerce Product Scraper",
        "body": "Fields: name, price, original_price, discount, description, images, category, sku, rating, review_count, availability, url"
      },
      {
        "title": "Real Estate Listings",
        "body": "Fields: address, price, bedrooms, bathrooms, sqft, lot_size, listing_type, agent, description, images, url"
      },
      {
        "title": "Job Listings",
        "body": "Fields: title, company, location, salary, job_type, description, requirements, posted_date, url"
      },
      {
        "title": "Directory/Business Listings",
        "body": "Fields: business_name, address, phone, website, category, rating, review_count, hours, description"
      },
      {
        "title": "News/Blog Articles",
        "body": "Fields: title, author, date, content, tags, url, image"
      },
      {
        "title": "Ethical Scraping Rules",
        "body": "Always respect robots.txt — check before scraping\nRate limit — minimum 2 second delay between requests\nIdentify yourself — use realistic but honest User-Agent\nDon't scrape personal data (emails, phone numbers) unless explicitly authorized by the client AND the data is publicly displayed\nCache responses — don't re-scrape pages unnecessarily\nCheck ToS — note if the site's terms prohibit scraping and inform the client"
      }
    ],
    "body": "Web Scraper as a Service\n\nTurn scraping briefs into deliverable scraping projects. Generates the scraper, runs it, cleans the data, and packages everything for the client.\n\nHow to Use\n/web-scraper-as-a-service \"Scrape all products from example-store.com — need name, price, description, images. CSV output.\"\n/web-scraper-as-a-service https://example.com --fields \"title,price,rating,url\" --format csv\n/web-scraper-as-a-service brief.txt\n\nScraper Generation Pipeline\nStep 1: Analyze the Target\n\nBefore writing any code:\n\nFetch the target URL to understand the page structure\nIdentify:\nIs the site server-rendered (static HTML) or client-rendered (JavaScript/SPA)?\nWhat anti-scraping measures are visible? (Cloudflare, CAPTCHAs, rate limits)\nPagination pattern (URL params, infinite scroll, load more button)\nData structure (product cards, table rows, list items)\nTotal estimated volume (number of pages/items)\nChoose the right tool:\nStatic HTML → Python + requests + BeautifulSoup\nJavaScript-rendered → Python + playwright\nAPI available → Direct API calls (check network tab patterns)\nStep 2: Build the Scraper\n\nGenerate a complete Python script in scraper/ directory:\n\nscraper/\n  scrape.py           # Main scraper script\n  requirements.txt    # Dependencies\n  config.json         # Target URLs, fields, settings\n  README.md           # Setup and usage instructions for client\n\n\nscrape.py must include:\n\n# Required features in every scraper:\n\n# 1. Configuration\nimport json\nconfig = json.load(open('config.json'))\n\n# 2. Rate limiting (ALWAYS — be respectful)\nimport time\nDELAY_BETWEEN_REQUESTS = 2  # seconds, adjustable in config\n\n# 3. Retry logic\nMAX_RETRIES = 3\nRETRY_DELAY = 5\n\n# 4. User-Agent rotation\nUSER_AGENTS = [\n    \"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36...\",\n    # ... at least 5 user agents\n]\n\n# 5. Progress tracking\nprint(f\"Scraping page {current}/{total} — {items_collected} items collected\")\n\n# 6. Error handling\n# - Log errors but don't crash on individual page failures\n# - Save progress incrementally (don't lose data on crash)\n# - Write errors to error_log.txt\n\n# 7. Output\n# - Save data incrementally (append to file, don't hold in memory)\n# - Support CSV and JSON output\n# - Clean and normalize data before saving\n\n# 8. Resume capability\n# - Track last successfully scraped page/URL\n# - Can resume from where it left off if interrupted\n\nStep 3: Data Cleaning\n\nAfter scraping, clean the data:\n\nRemove duplicates (by unique identifier or composite key)\nNormalize text (strip extra whitespace, fix encoding issues, consistent capitalization)\nValidate data (no empty required fields, prices are numbers, URLs are valid)\nStandardize formats (dates to ISO 8601, currency to numbers, consistent units)\nGenerate data quality report:\nData Quality Report\n───────────────────\nTotal records: 2,487\nDuplicates removed: 13\nEmpty fields filled: 0\nFields with issues: price (3 records had non-numeric values — cleaned)\nCompleteness: 99.5%\n\nStep 4: Client Deliverable Package\n\nGenerate a complete deliverable:\n\ndelivery/\n  data.csv                    # Clean data in requested format\n  data.json                   # JSON alternative\n  data-quality-report.md      # Quality metrics\n  scraper-documentation.md    # How the scraper works\n  README.md                   # Quick start guide\n\n\nscraper-documentation.md includes:\n\nWhat was scraped and from where\nHow many records collected\nData fields and their descriptions\nHow to re-run the scraper\nKnown limitations\nDate of scraping\nStep 5: Output to User\n\nPresent:\n\nSummary: X records scraped from Y pages, Z% data quality\nSample data: First 5 rows of the output\nFile locations: Where the deliverables are saved\nClient handoff notes: What to tell the client about the data\nScraper Templates\n\nBased on the target type, use the appropriate template:\n\nE-commerce Product Scraper\n\nFields: name, price, original_price, discount, description, images, category, sku, rating, review_count, availability, url\n\nReal Estate Listings\n\nFields: address, price, bedrooms, bathrooms, sqft, lot_size, listing_type, agent, description, images, url\n\nJob Listings\n\nFields: title, company, location, salary, job_type, description, requirements, posted_date, url\n\nDirectory/Business Listings\n\nFields: business_name, address, phone, website, category, rating, review_count, hours, description\n\nNews/Blog Articles\n\nFields: title, author, date, content, tags, url, image\n\nEthical Scraping Rules\nAlways respect robots.txt — check before scraping\nRate limit — minimum 2 second delay between requests\nIdentify yourself — use realistic but honest User-Agent\nDon't scrape personal data (emails, phone numbers) unless explicitly authorized by the client AND the data is publicly displayed\nCache responses — don't re-scrape pages unnecessarily\nCheck ToS — note if the site's terms prohibit scraping and inform the client"
  },
  "trust": {
    "sourceLabel": "tencent",
    "provenanceUrl": "https://clawhub.ai/seanwyngaard/web-scraper-as-a-service",
    "publisherUrl": "https://clawhub.ai/seanwyngaard/web-scraper-as-a-service",
    "owner": "seanwyngaard",
    "version": "1.0.0",
    "license": null,
    "verificationStatus": "Indexed source record"
  },
  "links": {
    "detailUrl": "https://openagent3.xyz/skills/web-scraper-as-a-service",
    "downloadUrl": "https://openagent3.xyz/downloads/web-scraper-as-a-service",
    "agentUrl": "https://openagent3.xyz/skills/web-scraper-as-a-service/agent",
    "manifestUrl": "https://openagent3.xyz/skills/web-scraper-as-a-service/agent.json",
    "briefUrl": "https://openagent3.xyz/skills/web-scraper-as-a-service/agent.md"
  }
}