{
  "schemaVersion": "1.0",
  "item": {
    "slug": "afrexai-web-scraping-engine",
    "name": "Web Scraping & Data Extraction Engine",
    "source": "tencent",
    "type": "skill",
    "category": "开发工具",
    "sourceUrl": "https://clawhub.ai/1kalin/afrexai-web-scraping-engine",
    "canonicalUrl": "https://clawhub.ai/1kalin/afrexai-web-scraping-engine",
    "targetPlatform": "OpenClaw"
  },
  "install": {
    "downloadMode": "redirect",
    "downloadUrl": "/downloads/afrexai-web-scraping-engine",
    "sourceDownloadUrl": "https://wry-manatee-359.convex.site/api/v1/download?slug=afrexai-web-scraping-engine",
    "sourcePlatform": "tencent",
    "targetPlatform": "OpenClaw",
    "installMethod": "Manual import",
    "extraction": "Extract archive",
    "prerequisites": [
      "OpenClaw"
    ],
    "packageFormat": "ZIP package",
    "includedAssets": [
      "README.md",
      "SKILL.md"
    ],
    "primaryDoc": "SKILL.md",
    "quickSetup": [
      "Download the package from Yavira.",
      "Extract the archive and review SKILL.md first.",
      "Import or place the package into your OpenClaw setup."
    ],
    "agentAssist": {
      "summary": "Hand the extracted package to your coding agent with a concrete install brief instead of figuring it out manually.",
      "steps": [
        "Download the package from Yavira.",
        "Extract it into a folder your agent can access.",
        "Paste one of the prompts below and point your agent at the extracted folder."
      ],
      "prompts": [
        {
          "label": "New install",
          "body": "I downloaded a skill package from Yavira. Read SKILL.md from the extracted folder and install it by following the included instructions. Then review README.md for any prerequisites, environment setup, or post-install checks. Tell me what you changed and call out any manual steps you could not complete."
        },
        {
          "label": "Upgrade existing",
          "body": "I downloaded an updated skill package from Yavira. Read SKILL.md from the extracted folder, compare it with my current installation, and upgrade it while preserving any custom configuration unless the package docs explicitly say otherwise. Then review README.md for any prerequisites, environment setup, or post-install checks. Summarize what changed and any follow-up checks I should run."
        }
      ]
    },
    "sourceHealth": {
      "source": "tencent",
      "status": "healthy",
      "reason": "direct_download_ok",
      "recommendedAction": "download",
      "checkedAt": "2026-05-07T17:22:31.273Z",
      "expiresAt": "2026-05-14T17:22:31.273Z",
      "httpStatus": 200,
      "finalUrl": "https://wry-manatee-359.convex.site/api/v1/download?slug=afrexai-annual-report",
      "contentType": "application/zip",
      "probeMethod": "head",
      "details": {
        "probeUrl": "https://wry-manatee-359.convex.site/api/v1/download?slug=afrexai-annual-report",
        "contentDisposition": "attachment; filename=\"afrexai-annual-report-1.0.0.zip\"",
        "redirectLocation": null,
        "bodySnippet": null
      },
      "scope": "source",
      "summary": "Source download looks usable.",
      "detail": "Yavira can redirect you to the upstream package for this source.",
      "primaryActionLabel": "Download for OpenClaw",
      "primaryActionHref": "/downloads/afrexai-web-scraping-engine"
    },
    "validation": {
      "installChecklist": [
        "Use the Yavira download entry.",
        "Review SKILL.md after the package is downloaded.",
        "Confirm the extracted package contains the expected setup assets."
      ],
      "postInstallChecks": [
        "Confirm the extracted package includes the expected docs or setup files.",
        "Validate the skill or prompts are available in your target agent workspace.",
        "Capture any manual follow-up steps the agent could not complete."
      ]
    },
    "downloadPageUrl": "https://openagent3.xyz/downloads/afrexai-web-scraping-engine",
    "agentPageUrl": "https://openagent3.xyz/skills/afrexai-web-scraping-engine/agent",
    "manifestUrl": "https://openagent3.xyz/skills/afrexai-web-scraping-engine/agent.json",
    "briefUrl": "https://openagent3.xyz/skills/afrexai-web-scraping-engine/agent.md"
  },
  "agentAssist": {
    "summary": "Hand the extracted package to your coding agent with a concrete install brief instead of figuring it out manually.",
    "steps": [
      "Download the package from Yavira.",
      "Extract it into a folder your agent can access.",
      "Paste one of the prompts below and point your agent at the extracted folder."
    ],
    "prompts": [
      {
        "label": "New install",
        "body": "I downloaded a skill package from Yavira. Read SKILL.md from the extracted folder and install it by following the included instructions. Then review README.md for any prerequisites, environment setup, or post-install checks. Tell me what you changed and call out any manual steps you could not complete."
      },
      {
        "label": "Upgrade existing",
        "body": "I downloaded an updated skill package from Yavira. Read SKILL.md from the extracted folder, compare it with my current installation, and upgrade it while preserving any custom configuration unless the package docs explicitly say otherwise. Then review README.md for any prerequisites, environment setup, or post-install checks. Summarize what changed and any follow-up checks I should run."
      }
    ]
  },
  "documentation": {
    "source": "clawhub",
    "primaryDoc": "SKILL.md",
    "sections": [
      {
        "title": "Quick Health Check (Run First)",
        "body": "Score your scraping operation (2 points each):\n\nSignalHealthyUnhealthyLegal compliancerobots.txt checked, ToS reviewedScraping blindlyArchitectureTool matches site complexityUsing Puppeteer for static HTMLAnti-detectionRotation, delays, fingerprint diversitySingle IP, no delaysData qualityValidation + dedup pipelineRaw dumps, no cleaningError handlingRetry logic, circuit breakersCrashes on first 403MonitoringSuccess rates tracked, alerts setNo visibilityStorageStructured, deduplicated, versionedFlat files, duplicatesSchedulingAppropriate frequency, off-peakHammering during business hours\n\nScore: /16 → 12+: Production-ready | 8-11: Needs work | <8: Stop and redesign"
      },
      {
        "title": "Pre-Scrape Compliance Checklist",
        "body": "compliance_brief:\n  target_domain: \"\"\n  date_assessed: \"\"\n  \n  robots_txt:\n    checked: false\n    target_paths_allowed: false\n    crawl_delay_specified: \"\"\n    ai_bot_rules: \"\"  # Many sites now block AI crawlers specifically\n    \n  terms_of_service:\n    reviewed: false\n    scraping_mentioned: false\n    scraping_prohibited: false\n    api_available: false\n    api_sufficient: false\n    \n  data_classification:\n    type: \"\"  # public-factual | public-personal | behind-auth | copyrighted\n    contains_pii: false\n    pii_types: []  # name, email, phone, address, photo\n    gdpr_applies: false  # EU residents' data\n    ccpa_applies: false  # California residents' data\n    \n  legal_risk: \"\"  # low | medium | high | do-not-scrape\n  decision: \"\"  # proceed | use-api | request-permission | abandon\n  justification: \"\""
      },
      {
        "title": "Legal Landscape Quick Reference",
        "body": "ScenarioRisk LevelKey Case LawPublic data, no login, robots.txt allowsLOWhiQ v. LinkedIn (2022)Public data, robots.txt disallowsMEDIUMMeta v. Bright Data (2024)Behind authenticationHIGHVan Buren v. US (2021), CFAAPersonal data without consentHIGHGDPR Art. 6, CCPA §1798.100Republishing copyrighted contentHIGHCopyright Act §106Price/product comparisonLOWeBay v. Bidder's Edge (fair use)Academic/research useLOW-MEDIUMVaries by jurisdictionBypassing anti-bot measuresHIGHCFAA \"exceeds authorized access\""
      },
      {
        "title": "Decision Rules",
        "body": "API exists and covers your needs? → Use the API. Always.\nrobots.txt disallows your target? → Respect it unless you have written permission.\nData behind login? → Do not scrape without explicit authorization.\nContains PII? → GDPR/CCPA compliance required before collection.\nCopyrighted content? → Extract facts/data points only, never full content.\nSite explicitly prohibits scraping? → Request permission or find alternative source."
      },
      {
        "title": "AI Crawler Considerations (2025+)",
        "body": "Many sites now specifically block AI-related crawlers:\n\n# Common AI bot blocks in robots.txt\nUser-agent: GPTBot\nUser-agent: ChatGPT-User\nUser-agent: Google-Extended\nUser-agent: CCBot\nUser-agent: anthropic-ai\nUser-agent: ClaudeBot\nUser-agent: Bytespider\nUser-agent: PerplexityBot\n\nRule: If collecting data for AI training, check for these specific blocks."
      },
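      {
        "title": "Robots.txt Check (Sketch)",
        "body": "A minimal sketch for automating this check with Python's standard urllib.robotparser; the example.com root and the agent list are placeholders, not part of the original package:\n\nfrom urllib.robotparser import RobotFileParser\n\ndef robots_allows(site_root, user_agents, path='/'):\n    # Fetch and parse robots.txt once, then test each user agent\n    rp = RobotFileParser()\n    rp.set_url(site_root.rstrip('/') + '/robots.txt')\n    rp.read()\n    return {ua: rp.can_fetch(ua, site_root.rstrip('/') + path) for ua in user_agents}\n\n# Hypothetical usage against the AI crawler names above\nprint(robots_allows('https://example.com', ['GPTBot', 'ClaudeBot', 'CCBot', '*']))"
      },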
      {
        "title": "Tool Selection Matrix",
        "body": "Tool/ApproachBest ForSpeedJS SupportComplexityCostHTTP client (requests/axios)Static HTML, APIs⚡⚡⚡❌LowFreeBeautiful Soup / CheerioStatic HTML parsing⚡⚡⚡❌LowFreeScrapyLarge-scale structured crawling⚡⚡⚡PluginMediumFreePlaywright / PuppeteerJS-rendered, SPAs, interactions⚡✅MediumFreeSeleniumLegacy, browser automation⚡✅HighFreeCrawleeHybrid (HTTP + browser fallback)⚡⚡✅MediumFreeFirecrawl / ScrapingBeeManaged, anti-bot bypass⚡⚡✅LowPaidBright Data / OxylabsEnterprise, proxy + browser⚡⚡✅LowPaid"
      },
      {
        "title": "Decision Tree",
        "body": "Is the content in the initial HTML source?\n├── YES → Is the site structure consistent?\n│   ├── YES → Static scraper (requests + BeautifulSoup/Cheerio)\n│   └── NO → Scrapy with custom parsers\n└── NO → Does the page require user interaction?\n    ├── YES → Playwright/Puppeteer with interaction scripts\n    └── NO → Playwright in non-interactive mode\n        └── At scale (>10K pages)? → Crawlee (hybrid mode)\n            └── Heavy anti-bot? → Managed service (Firecrawl/ScrapingBee)"
      },
      {
        "title": "Architecture Brief YAML",
        "body": "scraping_project:\n  name: \"\"\n  objective: \"\"  # What data, why, how often\n  \n  targets:\n    - domain: \"\"\n      pages_estimated: 0\n      rendering: \"static\" | \"javascript\" | \"spa\"\n      anti_bot: \"none\" | \"basic\" | \"cloudflare\" | \"advanced\"\n      rate_limit: \"\"  # requests per second safe limit\n      \n  tool_selected: \"\"\n  justification: \"\"\n  \n  data_schema:\n    fields: []\n    output_format: \"\"  # json | csv | database\n    \n  schedule:\n    frequency: \"\"  # once | hourly | daily | weekly\n    preferred_time: \"\"  # off-peak for target timezone\n    \n  infrastructure:\n    proxy_needed: false\n    proxy_type: \"\"  # residential | datacenter | mobile\n    storage: \"\"\n    monitoring: \"\""
      },
      {
        "title": "HTTP Request Best Practices",
        "body": "# Python example — production request pattern\nimport requests\nfrom requests.adapters import HTTPAdapter\nfrom urllib3.util.retry import Retry\n\nsession = requests.Session()\n\n# Retry strategy\nretry = Retry(\n    total=3,\n    backoff_factor=1,      # 1s, 2s, 4s\n    status_forcelist=[429, 500, 502, 503, 504],\n    respect_retry_after_header=True\n)\nsession.mount(\"https://\", HTTPAdapter(max_retries=retry))\n\n# Realistic headers\nsession.headers.update({\n    \"User-Agent\": \"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36\",\n    \"Accept\": \"text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8\",\n    \"Accept-Language\": \"en-US,en;q=0.9\",\n    \"Accept-Encoding\": \"gzip, deflate, br\",\n    \"Connection\": \"keep-alive\",\n    \"Cache-Control\": \"no-cache\",\n})"
      },
      {
        "title": "Header Rotation Strategy",
        "body": "Rotate these to avoid fingerprinting:\n\nHeaderRotation Pool SizeNotesUser-Agent20-50 real browser UAsMatch OS distributionAccept-Language5-10 locale combosMatch proxy geoSec-Ch-UaMatch User-AgentChrome/Edge/BraveRefererVary per requestPrevious page or search engine"
      },
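      {
        "title": "Header Rotation (Sketch)",
        "body": "A minimal sketch of pooled header rotation; the tiny pools below are placeholders standing in for production pools of 20-50 real browser UAs:\n\nimport random\n\nUA_POOL = [\n    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36',\n    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36',\n]\nLANG_POOL = ['en-US,en;q=0.9', 'en-GB,en;q=0.8']\n\ndef rotated_headers(referer=None):\n    # Pick one UA and language per session so the header set stays self-consistent\n    headers = {\n        'User-Agent': random.choice(UA_POOL),\n        'Accept-Language': random.choice(LANG_POOL),\n    }\n    if referer:\n        headers['Referer'] = referer\n    return headers"
      },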
      {
        "title": "Rate Limiting Rules",
        "body": "Site TypeSafe DelayAggressive (risky)Small business site5-10 seconds2-3 secondsMedium site2-5 seconds1-2 secondsLarge platform (Amazon, etc.)3-5 seconds1 secondAPI endpointPer API docsNever exceedrobots.txt crawl-delayRespect exactlyNever below\n\nRules:\n\nAlways respect Crawl-delay in robots.txt\nAdd random jitter (±30%) to avoid pattern detection\nSlow down during business hours for smaller sites\nRespect Retry-After headers — they mean it\nWatch for 429s — back off exponentially (2x each time)"
      },
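      {
        "title": "Jittered Delay (Sketch)",
        "body": "A minimal sketch of the jitter rule above; base delays come from the table:\n\nimport random\nimport time\n\ndef polite_sleep(base_seconds, jitter=0.3):\n    # Sleep base_seconds plus or minus 30% so request timing never forms a pattern\n    time.sleep(base_seconds * random.uniform(1 - jitter, 1 + jitter))\n\n# e.g. polite_sleep(5) between requests to a small business site"
      },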
      {
        "title": "CSS Selector Strategy (Priority Order)",
        "body": "Data attributes → [data-product-id], [data-price] (most stable)\nSemantic IDs → #product-title, #price (stable but can change)\nARIA attributes → [aria-label=\"Price\"] (accessibility, fairly stable)\nSemantic HTML → article, main, nav (structural, stable)\nClass names → .product-card (can change with redesigns)\nXPath position → //div[3]/span[2] (FRAGILE — last resort)"
      },
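      {
        "title": "Selector Fallback Chain (Sketch)",
        "body": "A minimal sketch of a fallback chain that tries selectors in the priority order above; the selectors themselves are illustrative:\n\nfrom bs4 import BeautifulSoup\n\ndef select_first(soup, selectors):\n    # Walk the chain in priority order; return text from the first match\n    for css in selectors:\n        node = soup.select_one(css)\n        if node:\n            return node.get_text(strip=True)\n    return None\n\nsoup = BeautifulSoup(html, 'html.parser')\nprice = select_first(soup, ['[data-price]', '#price', '[aria-label=\"Price\"]', '.product-card .price'])"
      },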
      {
        "title": "Extraction Patterns",
        "body": "Structured data first — Check before writing CSS selectors:\n\n# 1. Check JSON-LD (best source — structured, clean)\nimport json\nfrom bs4 import BeautifulSoup\n\nsoup = BeautifulSoup(html, 'html.parser')\nfor script in soup.find_all('script', type='application/ld+json'):\n    data = json.loads(script.string)\n    # Often contains: Product, Article, Organization, etc.\n\n# 2. Check Open Graph meta tags\nog_title = soup.find('meta', property='og:title')\nog_price = soup.find('meta', property='product:price:amount')\n\n# 3. Check microdata\nitems = soup.find_all(itemtype=True)\n\n# 4. Fall back to CSS selectors only if above are empty\n\nTable extraction pattern:\n\nimport pandas as pd\n\n# Quick table extraction\ntables = pd.read_html(html)  # Returns list of DataFrames\n\n# For complex tables with merged cells\ndef extract_table(soup, selector):\n    table = soup.select_one(selector)\n    headers = [th.get_text(strip=True) for th in table.select('thead th')]\n    rows = []\n    for tr in table.select('tbody tr'):\n        cells = [td.get_text(strip=True) for td in tr.select('td')]\n        rows.append(dict(zip(headers, cells)))\n    return rows\n\nPagination handling:\n\n# Pattern 1: Next button\nwhile True:\n    # ... scrape current page ...\n    next_link = soup.select_one('a.next-page, [rel=\"next\"], .pagination .next a')\n    if not next_link or not next_link.get('href'):\n        break\n    url = urljoin(base_url, next_link['href'])\n    \n# Pattern 2: API pagination (infinite scroll sites)\npage = 1\nwhile True:\n    resp = session.get(f\"{api_url}?page={page}&limit=50\")\n    data = resp.json()\n    if not data.get('results'):\n        break\n    # ... process results ...\n    page += 1\n\n# Pattern 3: Cursor-based\ncursor = None\nwhile True:\n    params = {\"limit\": 50}\n    if cursor:\n        params[\"cursor\"] = cursor\n    resp = session.get(api_url, params=params)\n    data = resp.json()\n    # ... process ...\n    cursor = data.get('next_cursor')\n    if not cursor:\n        break"
      },
      {
        "title": "JavaScript-Rendered Content",
        "body": "# Playwright pattern for JS-rendered pages\nfrom playwright.sync_api import sync_playwright\n\nwith sync_playwright() as p:\n    browser = p.chromium.launch(headless=True)\n    context = browser.new_context(\n        viewport={\"width\": 1920, \"height\": 1080},\n        user_agent=\"Mozilla/5.0 ...\",\n    )\n    page = context.new_page()\n    \n    # Block unnecessary resources (speed + stealth)\n    page.route(\"**/*.{png,jpg,jpeg,gif,svg,woff,woff2}\", \n               lambda route: route.abort())\n    \n    page.goto(url, wait_until=\"networkidle\")\n    \n    # Wait for specific content (better than arbitrary sleep)\n    page.wait_for_selector('[data-product-id]', timeout=10000)\n    \n    # Extract after JS rendering\n    content = page.content()\n    # ... parse with BeautifulSoup/Cheerio ...\n    \n    browser.close()"
      },
      {
        "title": "Detection Signals (What Sites Check)",
        "body": "SignalDetection MethodMitigationIP reputationIP blacklists, datacenter rangesResidential proxiesRequest rateRequests/min from same IPRate limiting + jitterTLS fingerprintJA3/JA4 hash matchingUse real browser or curl-impersonateBrowser fingerprintCanvas, WebGL, fontsPlaywright with stealth pluginJavaScript challengesCloudflare Turnstile, hCaptchaManaged browser servicesCookie/session behaviorMissing cookies, no historyFull session managementNavigation patternDirect URL hits, no referrerSimulate natural browsingMouse/keyboard eventsNo interaction telemetryEvent simulation (Playwright)Header consistencyMismatched headers vs UAHeader sets that match"
      },
      {
        "title": "Proxy Strategy",
        "body": "proxy_strategy:\n  # Tier 1: Free/Datacenter (for non-protected sites)\n  basic:\n    type: \"datacenter\"\n    cost: \"$1-5/GB\"\n    success_rate: \"60-80%\"\n    use_for: \"APIs, small sites, no anti-bot\"\n    \n  # Tier 2: Residential (for most protected sites)\n  standard:\n    type: \"residential\"\n    cost: \"$5-15/GB\"\n    success_rate: \"90-95%\"\n    use_for: \"Cloudflare, major platforms\"\n    rotation: \"per-request or sticky 10min\"\n    \n  # Tier 3: Mobile/ISP (for maximum stealth)\n  premium:\n    type: \"mobile\"\n    cost: \"$15-30/GB\"\n    success_rate: \"95-99%\"\n    use_for: \"Aggressive anti-bot, social media\"\n    \n  rules:\n    - Start with cheapest tier, escalate only on blocks\n    - Match proxy geo to target audience geo\n    - Rotate on 403/429, not every request\n    - Use sticky sessions for multi-page scrapes\n    - Monitor proxy health — remove slow/blocked IPs"
      },
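      {
        "title": "Proxy Rotation on Block (Sketch)",
        "body": "A minimal sketch of the rotate-on-403/429 rule above; the gateway URLs are placeholders for your provider's endpoints:\n\nimport requests\n\ndef fetch_with_rotation(session, url, proxy_pool):\n    # Rotate only when blocked (403/429); keep the same proxy otherwise\n    resp = None\n    for proxy in proxy_pool:\n        resp = session.get(url, proxies={'http': proxy, 'https': proxy}, timeout=30)\n        if resp.status_code not in (403, 429):\n            break\n    return resp\n\npool = ['http://user:pass@gw1.example:8000', 'http://user:pass@gw2.example:8000']\nresp = fetch_with_rotation(requests.Session(), 'https://example.com/page', pool)"
      },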
      {
        "title": "Playwright Stealth Configuration",
        "body": "# Essential stealth for Playwright\nfrom playwright.sync_api import sync_playwright\n\nwith sync_playwright() as p:\n    browser = p.chromium.launch(\n        headless=True,\n        args=[\n            '--disable-blink-features=AutomationControlled',\n            '--disable-features=IsolateOrigins,site-per-process',\n        ]\n    )\n    context = browser.new_context(\n        viewport={\"width\": 1920, \"height\": 1080},\n        locale=\"en-US\",\n        timezone_id=\"America/New_York\",\n        geolocation={\"latitude\": 40.7128, \"longitude\": -74.0060},\n        permissions=[\"geolocation\"],\n    )\n    \n    # Remove automation indicators\n    page = context.new_page()\n    page.add_init_script(\"\"\"\n        Object.defineProperty(navigator, 'webdriver', {get: () => undefined});\n        Object.defineProperty(navigator, 'plugins', {get: () => [1, 2, 3]});\n    \"\"\")"
      },
      {
        "title": "Cloudflare Bypass Decision",
        "body": "Cloudflare detected?\n├── JS Challenge only → Playwright with stealth + residential proxy\n├── Turnstile CAPTCHA → Managed service (ScrapingBee/Bright Data)\n├── Under Attack Mode → Wait, try later, or managed service\n└── WAF blocking → Different approach needed\n    ├── Check for API endpoints (network tab)\n    ├── Check for mobile app API\n    └── Consider if data is available elsewhere"
      },
      {
        "title": "Data Validation Rules",
        "body": "# Validation pattern — validate BEFORE storing\nfrom dataclasses import dataclass, field\nfrom typing import Optional\nimport re\nfrom datetime import datetime\n\n@dataclass\nclass ScrapedProduct:\n    url: str\n    title: str\n    price: Optional[float]\n    currency: str = \"USD\"\n    scraped_at: str = field(default_factory=lambda: datetime.utcnow().isoformat())\n    \n    def validate(self) -> list[str]:\n        errors = []\n        if not self.url.startswith('http'):\n            errors.append(\"Invalid URL\")\n        if not self.title or len(self.title) < 3:\n            errors.append(\"Title too short or missing\")\n        if self.price is not None and self.price < 0:\n            errors.append(\"Negative price\")\n        if self.price is not None and self.price > 1_000_000:\n            errors.append(\"Price suspiciously high — verify\")\n        if self.currency not in (\"USD\", \"EUR\", \"GBP\", \"BTC\"):\n            errors.append(f\"Unknown currency: {self.currency}\")\n        return errors"
      },
      {
        "title": "Deduplication Strategy",
        "body": "MethodWhen to UseImplementationURL-basedPages with unique URLsHash the canonical URLContent hashSame URL, changing contentMD5/SHA256 of key fieldsFuzzy matchingNear-duplicate detectionJaccard similarity > 0.85Composite keyMulti-field uniquenessHash(domain + product_id + variant)\n\nimport hashlib\n\ndef dedup_key(item: dict, fields: list[str]) -> str:\n    \"\"\"Generate dedup key from selected fields.\"\"\"\n    values = \"|\".join(str(item.get(f, \"\")) for f in fields)\n    return hashlib.sha256(values.encode()).hexdigest()\n\n# Usage\nseen = set()\nfor item in scraped_items:\n    key = dedup_key(item, [\"url\", \"product_id\"])\n    if key not in seen:\n        seen.add(key)\n        clean_items.append(item)"
      },
      {
        "title": "Data Cleaning Pipeline",
        "body": "Raw HTML → Parse → Extract → Validate → Clean → Deduplicate → Store\n                                ↓\n                          Quarantine (failed validation)\n\nCommon cleaning operations:\n\nProblemSolutionHTML entities (&amp;)html.unescape()Extra whitespace\" \".join(text.split())Unicode issuesunicodedata.normalize('NFKD', text)Price in text (\"$49.99\")Regex: r'[\\$£€]?([\\d,]+\\.?\\d*)'Date formats varydateutil.parser.parse() with dayfirst flagRelative URLsurllib.parse.urljoin(base, relative)Encoding issueschardet.detect() then decode"
      },
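      {
        "title": "Cleaning Pass (Sketch)",
        "body": "A minimal sketch chaining the cleaning operations above into one pass; the two helpers are illustrative, not from the original package:\n\nimport html\nimport re\nimport unicodedata\n\ndef clean_text(text):\n    text = html.unescape(text)                  # &amp; -> &\n    text = unicodedata.normalize('NFKD', text)  # normalize unicode quirks\n    return ' '.join(text.split())               # collapse whitespace\n\ndef parse_price(text):\n    # e.g. parse_price('$49.99') -> 49.99\n    m = re.search(r'[\\$£€]?([\\d,]+\\.?\\d*)', text)\n    return float(m.group(1).replace(',', '')) if m else None"
      },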
      {
        "title": "Storage Decision Guide",
        "body": "VolumeFrequencyQuery NeedsRecommendation<10K recordsOne-timeNoneJSON/CSV files<10K recordsRecurringSimple lookupsSQLite10K-1M recordsRecurringComplex queriesPostgreSQL1M+ recordsContinuousAnalyticsPostgreSQL + partitioningAppend-only logsContinuousTime-seriesClickHouse / TimescaleDB"
      },
      {
        "title": "SQLite Pattern (Most Common)",
        "body": "import sqlite3\nimport json\nfrom datetime import datetime\n\ndef init_db(path=\"scraper_data.db\"):\n    conn = sqlite3.connect(path)\n    conn.execute(\"\"\"\n        CREATE TABLE IF NOT EXISTS items (\n            id INTEGER PRIMARY KEY,\n            url TEXT UNIQUE,\n            data JSON NOT NULL,\n            scraped_at TEXT DEFAULT (datetime('now')),\n            updated_at TEXT,\n            checksum TEXT\n        )\n    \"\"\")\n    conn.execute(\"CREATE INDEX IF NOT EXISTS idx_url ON items(url)\")\n    conn.execute(\"CREATE INDEX IF NOT EXISTS idx_scraped ON items(scraped_at)\")\n    return conn\n\ndef upsert(conn, url, data, checksum):\n    conn.execute(\"\"\"\n        INSERT INTO items (url, data, checksum) VALUES (?, ?, ?)\n        ON CONFLICT(url) DO UPDATE SET\n            data = excluded.data,\n            updated_at = datetime('now'),\n            checksum = excluded.checksum\n        WHERE items.checksum != excluded.checksum\n    \"\"\", (url, json.dumps(data), checksum))\n    conn.commit()"
      },
      {
        "title": "Export Formats",
        "body": "# CSV export\nimport csv\ndef to_csv(items, path, fields):\n    with open(path, 'w', newline='') as f:\n        writer = csv.DictWriter(f, fieldnames=fields)\n        writer.writeheader()\n        writer.writerows(items)\n\n# JSON Lines (best for large datasets — streaming)\ndef to_jsonl(items, path):\n    with open(path, 'w') as f:\n        for item in items:\n            f.write(json.dumps(item) + '\\n')\n\n# Incremental export (only new/changed since last export)\ndef export_since(conn, last_export_time):\n    cursor = conn.execute(\n        \"SELECT data FROM items WHERE scraped_at > ? OR updated_at > ?\",\n        (last_export_time, last_export_time)\n    )\n    return [json.loads(row[0]) for row in cursor]"
      },
      {
        "title": "Error Classification",
        "body": "HTTP CodeMeaningAction200SuccessProcess normally301/302RedirectFollow (max 5 hops)403Forbidden/blockedRotate proxy, slow down404Not foundLog, skip, mark URL dead429Rate limitedRespect Retry-After, back off 2x500-504Server errorRetry 3x with backoffConnection timeoutNetwork issueRetry with different proxySSL errorCertificate issueLog, investigate, skip"
      },
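      {
        "title": "Error Dispatch (Sketch)",
        "body": "A minimal sketch dispatching on the table above, assuming a numeric Retry-After header; the retry cap is left to the caller:\n\nimport time\n\ndef classify(resp, attempt):\n    code = resp.status_code\n    if code == 200:\n        return 'process'\n    if code == 404:\n        return 'mark_dead'\n    if code == 429:\n        time.sleep(int(resp.headers.get('Retry-After', 2 ** attempt)))  # back off 2x\n        return 'retry'\n    if code == 403:\n        return 'rotate_proxy'\n    if 500 <= code <= 504:\n        time.sleep(2 ** attempt)\n        return 'retry' if attempt < 3 else 'give_up'\n    return 'log_and_skip'"
      },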
      {
        "title": "Circuit Breaker Pattern",
        "body": "class CircuitBreaker:\n    def __init__(self, failure_threshold=5, reset_timeout=300):\n        self.failures = 0\n        self.threshold = failure_threshold\n        self.reset_timeout = reset_timeout\n        self.last_failure = 0\n        self.state = \"closed\"  # closed | open | half-open\n    \n    def record_failure(self):\n        self.failures += 1\n        self.last_failure = time.time()\n        if self.failures >= self.threshold:\n            self.state = \"open\"\n            # Alert: \"Circuit open — too many failures\"\n    \n    def record_success(self):\n        self.failures = 0\n        self.state = \"closed\"\n    \n    def can_proceed(self):\n        if self.state == \"closed\":\n            return True\n        if self.state == \"open\":\n            if time.time() - self.last_failure > self.reset_timeout:\n                self.state = \"half-open\"\n                return True  # Try one request\n            return False\n        return True  # half-open: allow attempt"
      },
      {
        "title": "Checkpoint & Resume",
        "body": "import json\nfrom pathlib import Path\n\nclass Checkpointer:\n    def __init__(self, path=\"checkpoint.json\"):\n        self.path = Path(path)\n        self.state = self._load()\n    \n    def _load(self):\n        if self.path.exists():\n            return json.loads(self.path.read_text())\n        return {\"completed_urls\": [], \"last_page\": 0, \"cursor\": None}\n    \n    def save(self):\n        self.path.write_text(json.dumps(self.state))\n    \n    def is_done(self, url):\n        return url in self.state[\"completed_urls\"]\n    \n    def mark_done(self, url):\n        self.state[\"completed_urls\"].append(url)\n        if len(self.state[\"completed_urls\"]) % 50 == 0:\n            self.save()  # Periodic save"
      },
      {
        "title": "Scraper Health Dashboard",
        "body": "dashboard:\n  real_time:\n    - metric: \"requests_per_minute\"\n      alert_if: \"> 60 for small sites\"\n    - metric: \"success_rate\"\n      alert_if: \"< 90%\"\n    - metric: \"avg_response_time_ms\"\n      alert_if: \"> 5000\"\n    - metric: \"blocked_rate\"\n      alert_if: \"> 10%\"\n      \n  per_run:\n    - metric: \"pages_scraped\"\n    - metric: \"items_extracted\"\n    - metric: \"items_validated\"\n    - metric: \"items_deduplicated\"\n    - metric: \"new_items\"\n    - metric: \"updated_items\"\n    - metric: \"errors_by_type\"\n    - metric: \"run_duration\"\n    - metric: \"proxy_cost\"\n    \n  weekly:\n    - metric: \"data_freshness\"\n      description: \"% of records updated in last 7 days\"\n    - metric: \"site_structure_changes\"\n      description: \"Selectors that stopped matching\"\n    - metric: \"total_cost\"\n      description: \"Proxy + compute + storage\""
      },
      {
        "title": "Breakage Detection",
        "body": "Sites redesign. Selectors break. Detect it early:\n\ndef health_check(results: list[dict], expected_fields: list[str]) -> dict:\n    \"\"\"Check if scraper is still extracting correctly.\"\"\"\n    total = len(results)\n    if total == 0:\n        return {\"status\": \"CRITICAL\", \"message\": \"Zero results — likely broken\"}\n    \n    field_coverage = {}\n    for field in expected_fields:\n        filled = sum(1 for r in results if r.get(field))\n        coverage = filled / total\n        field_coverage[field] = coverage\n        \n    issues = []\n    for field, coverage in field_coverage.items():\n        if coverage < 0.5:\n            issues.append(f\"{field}: {coverage:.0%} fill rate (expected >50%)\")\n    \n    if issues:\n        return {\"status\": \"WARNING\", \"issues\": issues}\n    return {\"status\": \"OK\", \"field_coverage\": field_coverage}"
      },
      {
        "title": "Operational Runbook",
        "body": "Daily:\n\nCheck success rate per target domain\nReview error logs for new patterns\nVerify data freshness\n\nWeekly:\n\nCompare extraction counts vs baseline (>20% drop = investigate)\nReview proxy spend\nSpot-check 10 random records for accuracy\n\nMonthly:\n\nFull selector validation against live pages\nReview legal compliance (robots.txt changes, ToS updates)\nCost optimization review\nPrune dead URLs from queue"
      },
      {
        "title": "Pattern 1: E-commerce Price Monitor",
        "body": "use_case: \"Track competitor prices daily\"\ntool: \"requests + BeautifulSoup\"\nschedule: \"Daily at 03:00 UTC (off-peak)\"\ntargets: [\"competitor-a.com/products\", \"competitor-b.com/api\"]\ndata:\n  - product_id\n  - product_name\n  - price\n  - currency\n  - in_stock\n  - scraped_at\nstorage: \"SQLite with price history\"\nalerts: \"Price change > 10% → notify\""
      },
      {
        "title": "Pattern 2: Job Board Aggregator",
        "body": "use_case: \"Aggregate job listings from multiple boards\"\ntool: \"Scrapy with per-site spiders\"\nschedule: \"Every 6 hours\"\ntargets: [\"board-a.com\", \"board-b.com\", \"board-c.com\"]\ndata:\n  - title\n  - company\n  - location\n  - salary_range\n  - posted_date\n  - url\n  - source\ndedup: \"Hash(title + company + location)\"\nstorage: \"PostgreSQL\""
      },
      {
        "title": "Pattern 3: News & Content Monitor",
        "body": "use_case: \"Monitor industry news mentions\"\ntool: \"requests + RSS feeds (preferred) + web fallback\"\nschedule: \"Every 30 minutes\"\napproach:\n  1: \"RSS/Atom feeds (fastest, cleanest)\"\n  2: \"Google News RSS for topic\"\n  3: \"Direct scraping if no feed\"\ndata:\n  - headline\n  - source\n  - url\n  - published_at\n  - snippet\n  - sentiment\nalerts: \"Keyword match → immediate notification\""
      },
      {
        "title": "Pattern 4: Social Media Intelligence",
        "body": "use_case: \"Track brand mentions and sentiment\"\ntool: \"Official APIs (always) + web search fallback\"\nrules:\n  - NEVER scrape social platforms directly — use APIs\n  - Twitter/X: Official API ($100/mo basic)\n  - Reddit: Official API (free tier available)\n  - LinkedIn: No scraping (aggressive legal action)\n  - Instagram: Official API only (Meta Business)\nfallback: \"Brave/Google search for public mentions\""
      },
      {
        "title": "Pattern 5: Real Estate Listings",
        "body": "use_case: \"Track property listings and prices\"\ntool: \"Playwright (most listing sites are JS-heavy)\"\nschedule: \"Daily\"\nchallenges:\n  - Heavy JavaScript rendering\n  - Anti-bot measures (Cloudflare common)\n  - Frequent layout changes\n  - Map-based results\napproach: \"API endpoint discovery via network tab first\""
      },
      {
        "title": "Concurrency Architecture",
        "body": "Single machine (small scale):\n├── asyncio + aiohttp (Python) → 50-200 concurrent requests\n├── Worker pool (ThreadPoolExecutor) → 10-50 threads\n└── Scrapy reactor → Built-in concurrency\n\nMulti-machine (large scale):\n├── URL queue: Redis / RabbitMQ / SQS\n├── Workers: Multiple Scrapy/custom workers\n├── Results: Shared PostgreSQL / S3\n└── Coordinator: Celery / custom scheduler"
      },
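      {
        "title": "Async Fetch Pool (Sketch)",
        "body": "A minimal single-machine sketch with asyncio + aiohttp; the 50-connection cap matches the small end of the range above:\n\nimport asyncio\nimport aiohttp\n\nasync def fetch_all(urls, limit=50):\n    sem = asyncio.Semaphore(limit)  # cap in-flight requests\n    async with aiohttp.ClientSession() as session:\n        async def fetch(url):\n            async with sem:\n                async with session.get(url) as resp:\n                    return url, resp.status, await resp.text()\n        return await asyncio.gather(*(fetch(u) for u in urls))\n\n# results = asyncio.run(fetch_all(url_list))"
      },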
      {
        "title": "Cost Optimization",
        "body": "LeverImpactHowStatic > Browser10-50x cheaperAlways try HTTP firstBlock images/CSS/fonts60-80% bandwidth savedRoute filteringCache DNSMinor but cumulativeLocal DNS cacheCompress responses50-70% bandwidthAccept-Encoding: gzip, brSmart schedulingAvoid redundant scrapesChange detection before full re-scrapeProxy tier matching3-10x cost differenceDon't use residential for easy sites"
      },
      {
        "title": "API Discovery (Network Tab Mining)",
        "body": "Before building a scraper, check if the site has hidden API endpoints:\n\nOpen DevTools → Network tab\nFilter by XHR/Fetch\nNavigate the site, click load-more, filter/sort\nLook for JSON responses — these are your goldmine\nMost SPAs load data via REST/GraphQL APIs\n\nCommon hidden API patterns:\n\n/api/v1/products?page=1&limit=20\n/graphql with query parameters\n/_next/data/... (Next.js data routes)\n/wp-json/wp/v2/posts (WordPress)"
      },
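      {
        "title": "Replaying a Discovered API (Sketch)",
        "body": "A minimal sketch replaying a discovered endpoint with plain requests; the /api/v1/products path and the 'results' response key are hypothetical examples of the patterns above:\n\nimport requests\n\nsession = requests.Session()\nsession.headers['Accept'] = 'application/json'\n\nitems, page = [], 1\nwhile True:\n    resp = session.get('https://example.com/api/v1/products',\n                       params={'page': page, 'limit': 20}, timeout=30)\n    data = resp.json()\n    if not data.get('results'):\n        break\n    items.extend(data['results'])\n    page += 1"
      },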
      {
        "title": "Headless Browser Optimization",
        "body": "# Minimize browser resource usage\ncontext = browser.new_context(\n    viewport={\"width\": 1280, \"height\": 720},\n    java_script_enabled=True,  # Only if needed\n    has_touch=False,\n    is_mobile=False,\n)\n\n# Block resource types you don't need\npage.route(\"**/*\", lambda route: (\n    route.abort() if route.request.resource_type in \n    [\"image\", \"stylesheet\", \"font\", \"media\"] \n    else route.continue_()\n))"
      },
      {
        "title": "Scraping Behind Authentication",
        "body": "# When authorized to scrape behind login\n# ALWAYS use session-based auth, never store passwords in code\n\n# Pattern: Login once, reuse session\nsession = requests.Session()\nlogin_resp = session.post(\"https://example.com/login\", data={\n    \"username\": os.environ[\"SCRAPE_USER\"],\n    \"password\": os.environ[\"SCRAPE_PASS\"],\n})\nassert login_resp.ok, \"Login failed\"\n\n# Session cookies are now stored — use for subsequent requests\ndata_resp = session.get(\"https://example.com/api/data\")"
      },
      {
        "title": "Change Detection (Avoid Redundant Scrapes)",
        "body": "import hashlib\n\ndef has_changed(url, session, last_etag=None, last_modified=None):\n    \"\"\"Check if page changed without downloading full content.\"\"\"\n    headers = {}\n    if last_etag:\n        headers[\"If-None-Match\"] = last_etag\n    if last_modified:\n        headers[\"If-Modified-Since\"] = last_modified\n    \n    resp = session.head(url, headers=headers)\n    \n    if resp.status_code == 304:\n        return False, resp.headers.get(\"ETag\"), resp.headers.get(\"Last-Modified\")\n    \n    return True, resp.headers.get(\"ETag\"), resp.headers.get(\"Last-Modified\")"
      },
      {
        "title": "Quality Scoring Rubric (0-100)",
        "body": "DimensionWeightWhat to AssessLegal compliance20%robots.txt, ToS, PII handling, audit trailData quality20%Validation, accuracy, completeness, freshnessResilience15%Error handling, retries, circuit breakers, checkpointingAnti-detection15%Proxy rotation, fingerprint diversity, rate limitingArchitecture10%Right tool selection, clean code, modularityMonitoring10%Success rates, breakage detection, alertingPerformance5%Speed, cost efficiency, resource usageDocumentation5%Runbook, schema docs, legal assessment\n\nGrading: 90+ Excellent | 75-89 Good | 60-74 Needs work | <60 Redesign"
      },
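      {
        "title": "Rubric Scoring (Sketch)",
        "body": "A minimal sketch computing the weighted total from the rubric above; the dimension keys are illustrative:\n\nWEIGHTS = {\n    'legal_compliance': 0.20, 'data_quality': 0.20, 'resilience': 0.15,\n    'anti_detection': 0.15, 'architecture': 0.10, 'monitoring': 0.10,\n    'performance': 0.05, 'documentation': 0.05,\n}\n\ndef quality_score(ratings):\n    # ratings: dimension -> 0-100; returns the weighted 0-100 total\n    return sum(w * ratings.get(dim, 0) for dim, w in WEIGHTS.items())"
      },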
      {
        "title": "10 Common Mistakes",
        "body": "#MistakeFix1No robots.txt checkAlways check first — it's your legal defense2Fixed delays (no jitter)Add ±30% random jitter to all delays3No data validationValidate every field before storing4Using browser for static HTMLHTTP client is 10-50x faster and cheaper5Single IP, no rotationProxy rotation for any serious scraping6No breakage detectionMonitor extraction counts and field fill rates7Storing raw HTML onlyExtract + structure immediately8No checkpoint/resumeLong scrapes must be resumable9Ignoring structured dataJSON-LD/microdata is cleaner than CSS selectors10Scraping when API existsAlways check for API first"
      },
      {
        "title": "5 Edge Cases",
        "body": "Single-page apps (React/Vue/Angular): Must use browser rendering OR find the underlying API (network tab). Prefer API discovery — it's faster and more reliable.\n\n\nInfinite scroll: Intercept the XHR/fetch calls that load more content. Simulate scrolling only as last resort. The API endpoint usually accepts page or offset params.\n\n\nCAPTCHAs: If you're hitting CAPTCHAs, you're scraping too aggressively. Slow down first. If CAPTCHAs persist: managed services (2Captcha, Anti-Captcha) or rethink approach.\n\n\nDynamic class names (CSS modules, Tailwind): Use data attributes, ARIA labels, or text content selectors instead. [data-testid=\"price\"] survives redesigns. .sc-bdVTJa does not.\n\n\nMulti-language sites: Detect language via html[lang] attribute. Set Accept-Language header to get desired locale. Watch for different URL structures (/en/, /de/, subdomains)."
      },
      {
        "title": "Natural Language Commands",
        "body": "\"Check if I can scrape [URL]\" → Run compliance checklist (robots.txt, ToS, data type)\n\"What tool should I use for [site]?\" → Analyze site rendering, anti-bot, recommend tool\n\"Build a scraper for [description]\" → Full architecture brief + code pattern\n\"My scraper is getting blocked\" → Anti-detection diagnostic + proxy/stealth recommendations\n\"Extract [data] from [URL]\" → Check structured data first, then CSS selectors\n\"Monitor [site] for changes\" → Change detection + scheduling + alerting setup\n\"How do I handle pagination on [site]?\" → Identify pagination type + code pattern\n\"Scrape at scale ([N] pages)\" → Concurrency architecture + cost estimate\n\"Clean and store this scraped data\" → Validation + dedup + storage recommendation\n\"Is my scraper healthy?\" → Run health check + breakage detection\n\"Find the API behind [site]\" → Network tab mining guide + common patterns\n\"Set up price monitoring for [competitors]\" → Full e-commerce monitor pattern"
      }
    ],
    "body": "Web Scraping & Data Extraction Engine\nQuick Health Check (Run First)\n\nScore your scraping operation (2 points each):\n\nSignal\tHealthy\tUnhealthy\nLegal compliance\trobots.txt checked, ToS reviewed\tScraping blindly\nArchitecture\tTool matches site complexity\tUsing Puppeteer for static HTML\nAnti-detection\tRotation, delays, fingerprint diversity\tSingle IP, no delays\nData quality\tValidation + dedup pipeline\tRaw dumps, no cleaning\nError handling\tRetry logic, circuit breakers\tCrashes on first 403\nMonitoring\tSuccess rates tracked, alerts set\tNo visibility\nStorage\tStructured, deduplicated, versioned\tFlat files, duplicates\nScheduling\tAppropriate frequency, off-peak\tHammering during business hours\n\nScore: /16 → 12+: Production-ready | 8-11: Needs work | <8: Stop and redesign\n\nPhase 1: Legal & Ethical Foundation\nPre-Scrape Compliance Checklist\ncompliance_brief:\n  target_domain: \"\"\n  date_assessed: \"\"\n  \n  robots_txt:\n    checked: false\n    target_paths_allowed: false\n    crawl_delay_specified: \"\"\n    ai_bot_rules: \"\"  # Many sites now block AI crawlers specifically\n    \n  terms_of_service:\n    reviewed: false\n    scraping_mentioned: false\n    scraping_prohibited: false\n    api_available: false\n    api_sufficient: false\n    \n  data_classification:\n    type: \"\"  # public-factual | public-personal | behind-auth | copyrighted\n    contains_pii: false\n    pii_types: []  # name, email, phone, address, photo\n    gdpr_applies: false  # EU residents' data\n    ccpa_applies: false  # California residents' data\n    \n  legal_risk: \"\"  # low | medium | high | do-not-scrape\n  decision: \"\"  # proceed | use-api | request-permission | abandon\n  justification: \"\"\n\nLegal Landscape Quick Reference\nScenario\tRisk Level\tKey Case Law\nPublic data, no login, robots.txt allows\tLOW\thiQ v. LinkedIn (2022)\nPublic data, robots.txt disallows\tMEDIUM\tMeta v. Bright Data (2024)\nBehind authentication\tHIGH\tVan Buren v. US (2021), CFAA\nPersonal data without consent\tHIGH\tGDPR Art. 6, CCPA §1798.100\nRepublishing copyrighted content\tHIGH\tCopyright Act §106\nPrice/product comparison\tLOW\teBay v. Bidder's Edge (fair use)\nAcademic/research use\tLOW-MEDIUM\tVaries by jurisdiction\nBypassing anti-bot measures\tHIGH\tCFAA \"exceeds authorized access\"\nDecision Rules\nAPI exists and covers your needs? → Use the API. Always.\nrobots.txt disallows your target? → Respect it unless you have written permission.\nData behind login? → Do not scrape without explicit authorization.\nContains PII? → GDPR/CCPA compliance required before collection.\nCopyrighted content? → Extract facts/data points only, never full content.\nSite explicitly prohibits scraping? 
→ Request permission or find alternative source.\nAI Crawler Considerations (2025+)\n\nMany sites now specifically block AI-related crawlers:\n\n# Common AI bot blocks in robots.txt\nUser-agent: GPTBot\nUser-agent: ChatGPT-User\nUser-agent: Google-Extended\nUser-agent: CCBot\nUser-agent: anthropic-ai\nUser-agent: ClaudeBot\nUser-agent: Bytespider\nUser-agent: PerplexityBot\n\n\nRule: If collecting data for AI training, check for these specific blocks.\n\nPhase 2: Architecture Decision\nTool Selection Matrix\nTool/Approach\tBest For\tSpeed\tJS Support\tComplexity\tCost\nHTTP client (requests/axios)\tStatic HTML, APIs\t⚡⚡⚡\t❌\tLow\tFree\nBeautiful Soup / Cheerio\tStatic HTML parsing\t⚡⚡⚡\t❌\tLow\tFree\nScrapy\tLarge-scale structured crawling\t⚡⚡⚡\tPlugin\tMedium\tFree\nPlaywright / Puppeteer\tJS-rendered, SPAs, interactions\t⚡\t✅\tMedium\tFree\nSelenium\tLegacy, browser automation\t⚡\t✅\tHigh\tFree\nCrawlee\tHybrid (HTTP + browser fallback)\t⚡⚡\t✅\tMedium\tFree\nFirecrawl / ScrapingBee\tManaged, anti-bot bypass\t⚡⚡\t✅\tLow\tPaid\nBright Data / Oxylabs\tEnterprise, proxy + browser\t⚡⚡\t✅\tLow\tPaid\nDecision Tree\nIs the content in the initial HTML source?\n├── YES → Is the site structure consistent?\n│   ├── YES → Static scraper (requests + BeautifulSoup/Cheerio)\n│   └── NO → Scrapy with custom parsers\n└── NO → Does the page require user interaction?\n    ├── YES → Playwright/Puppeteer with interaction scripts\n    └── NO → Playwright in non-interactive mode\n        └── At scale (>10K pages)? → Crawlee (hybrid mode)\n            └── Heavy anti-bot? → Managed service (Firecrawl/ScrapingBee)\n\nArchitecture Brief YAML\nscraping_project:\n  name: \"\"\n  objective: \"\"  # What data, why, how often\n  \n  targets:\n    - domain: \"\"\n      pages_estimated: 0\n      rendering: \"static\" | \"javascript\" | \"spa\"\n      anti_bot: \"none\" | \"basic\" | \"cloudflare\" | \"advanced\"\n      rate_limit: \"\"  # requests per second safe limit\n      \n  tool_selected: \"\"\n  justification: \"\"\n  \n  data_schema:\n    fields: []\n    output_format: \"\"  # json | csv | database\n    \n  schedule:\n    frequency: \"\"  # once | hourly | daily | weekly\n    preferred_time: \"\"  # off-peak for target timezone\n    \n  infrastructure:\n    proxy_needed: false\n    proxy_type: \"\"  # residential | datacenter | mobile\n    storage: \"\"\n    monitoring: \"\"\n\nPhase 3: Request Engineering\nHTTP Request Best Practices\n# Python example — production request pattern\nimport requests\nfrom requests.adapters import HTTPAdapter\nfrom urllib3.util.retry import Retry\n\nsession = requests.Session()\n\n# Retry strategy\nretry = Retry(\n    total=3,\n    backoff_factor=1,      # 1s, 2s, 4s\n    status_forcelist=[429, 500, 502, 503, 504],\n    respect_retry_after_header=True\n)\nsession.mount(\"https://\", HTTPAdapter(max_retries=retry))\n\n# Realistic headers\nsession.headers.update({\n    \"User-Agent\": \"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36\",\n    \"Accept\": \"text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8\",\n    \"Accept-Language\": \"en-US,en;q=0.9\",\n    \"Accept-Encoding\": \"gzip, deflate, br\",\n    \"Connection\": \"keep-alive\",\n    \"Cache-Control\": \"no-cache\",\n})\n\nHeader Rotation Strategy\n\nRotate these to avoid fingerprinting:\n\nHeader\tRotation Pool Size\tNotes\nUser-Agent\t20-50 real browser UAs\tMatch OS distribution\nAccept-Language\t5-10 locale 
combos\tMatch proxy geo\nSec-Ch-Ua\tMatch User-Agent\tChrome/Edge/Brave\nReferer\tVary per request\tPrevious page or search engine\nRate Limiting Rules\nSite Type\tSafe Delay\tAggressive (risky)\nSmall business site\t5-10 seconds\t2-3 seconds\nMedium site\t2-5 seconds\t1-2 seconds\nLarge platform (Amazon, etc.)\t3-5 seconds\t1 second\nAPI endpoint\tPer API docs\tNever exceed\nrobots.txt crawl-delay\tRespect exactly\tNever below\n\nRules:\n\nAlways respect Crawl-delay in robots.txt\nAdd random jitter (±30%) to avoid pattern detection\nSlow down during business hours for smaller sites\nRespect Retry-After headers — they mean it\nWatch for 429s — back off exponentially (2x each time)\nPhase 4: Parsing & Extraction\nCSS Selector Strategy (Priority Order)\nData attributes → [data-product-id], [data-price] (most stable)\nSemantic IDs → #product-title, #price (stable but can change)\nARIA attributes → [aria-label=\"Price\"] (accessibility, fairly stable)\nSemantic HTML → article, main, nav (structural, stable)\nClass names → .product-card (can change with redesigns)\nXPath position → //div[3]/span[2] (FRAGILE — last resort)\nExtraction Patterns\n\nStructured data first — Check before writing CSS selectors:\n\n# 1. Check JSON-LD (best source — structured, clean)\nimport json\nfrom bs4 import BeautifulSoup\n\nsoup = BeautifulSoup(html, 'html.parser')\nfor script in soup.find_all('script', type='application/ld+json'):\n    data = json.loads(script.string)\n    # Often contains: Product, Article, Organization, etc.\n\n# 2. Check Open Graph meta tags\nog_title = soup.find('meta', property='og:title')\nog_price = soup.find('meta', property='product:price:amount')\n\n# 3. Check microdata\nitems = soup.find_all(itemtype=True)\n\n# 4. Fall back to CSS selectors only if above are empty\n\n\nTable extraction pattern:\n\nimport pandas as pd\n\n# Quick table extraction\ntables = pd.read_html(html)  # Returns list of DataFrames\n\n# For complex tables with merged cells\ndef extract_table(soup, selector):\n    table = soup.select_one(selector)\n    headers = [th.get_text(strip=True) for th in table.select('thead th')]\n    rows = []\n    for tr in table.select('tbody tr'):\n        cells = [td.get_text(strip=True) for td in tr.select('td')]\n        rows.append(dict(zip(headers, cells)))\n    return rows\n\n\nPagination handling:\n\n# Pattern 1: Next button\nwhile True:\n    # ... scrape current page ...\n    next_link = soup.select_one('a.next-page, [rel=\"next\"], .pagination .next a')\n    if not next_link or not next_link.get('href'):\n        break\n    url = urljoin(base_url, next_link['href'])\n    \n# Pattern 2: API pagination (infinite scroll sites)\npage = 1\nwhile True:\n    resp = session.get(f\"{api_url}?page={page}&limit=50\")\n    data = resp.json()\n    if not data.get('results'):\n        break\n    # ... process results ...\n    page += 1\n\n# Pattern 3: Cursor-based\ncursor = None\nwhile True:\n    params = {\"limit\": 50}\n    if cursor:\n        params[\"cursor\"] = cursor\n    resp = session.get(api_url, params=params)\n    data = resp.json()\n    # ... 
process ...\n    cursor = data.get('next_cursor')\n    if not cursor:\n        break\n\nJavaScript-Rendered Content\n# Playwright pattern for JS-rendered pages\nfrom playwright.sync_api import sync_playwright\n\nwith sync_playwright() as p:\n    browser = p.chromium.launch(headless=True)\n    context = browser.new_context(\n        viewport={\"width\": 1920, \"height\": 1080},\n        user_agent=\"Mozilla/5.0 ...\",\n    )\n    page = context.new_page()\n    \n    # Block unnecessary resources (speed + stealth)\n    page.route(\"**/*.{png,jpg,jpeg,gif,svg,woff,woff2}\", \n               lambda route: route.abort())\n    \n    page.goto(url, wait_until=\"networkidle\")\n    \n    # Wait for specific content (better than arbitrary sleep)\n    page.wait_for_selector('[data-product-id]', timeout=10000)\n    \n    # Extract after JS rendering\n    content = page.content()\n    # ... parse with BeautifulSoup/Cheerio ...\n    \n    browser.close()\n\nPhase 5: Anti-Detection & Stealth\nDetection Signals (What Sites Check)\nSignal\tDetection Method\tMitigation\nIP reputation\tIP blacklists, datacenter ranges\tResidential proxies\nRequest rate\tRequests/min from same IP\tRate limiting + jitter\nTLS fingerprint\tJA3/JA4 hash matching\tUse real browser or curl-impersonate\nBrowser fingerprint\tCanvas, WebGL, fonts\tPlaywright with stealth plugin\nJavaScript challenges\tCloudflare Turnstile, hCaptcha\tManaged browser services\nCookie/session behavior\tMissing cookies, no history\tFull session management\nNavigation pattern\tDirect URL hits, no referrer\tSimulate natural browsing\nMouse/keyboard events\tNo interaction telemetry\tEvent simulation (Playwright)\nHeader consistency\tMismatched headers vs UA\tHeader sets that match\nProxy Strategy\nproxy_strategy:\n  # Tier 1: Free/Datacenter (for non-protected sites)\n  basic:\n    type: \"datacenter\"\n    cost: \"$1-5/GB\"\n    success_rate: \"60-80%\"\n    use_for: \"APIs, small sites, no anti-bot\"\n    \n  # Tier 2: Residential (for most protected sites)\n  standard:\n    type: \"residential\"\n    cost: \"$5-15/GB\"\n    success_rate: \"90-95%\"\n    use_for: \"Cloudflare, major platforms\"\n    rotation: \"per-request or sticky 10min\"\n    \n  # Tier 3: Mobile/ISP (for maximum stealth)\n  premium:\n    type: \"mobile\"\n    cost: \"$15-30/GB\"\n    success_rate: \"95-99%\"\n    use_for: \"Aggressive anti-bot, social media\"\n    \n  rules:\n    - Start with cheapest tier, escalate only on blocks\n    - Match proxy geo to target audience geo\n    - Rotate on 403/429, not every request\n    - Use sticky sessions for multi-page scrapes\n    - Monitor proxy health — remove slow/blocked IPs\n\nPlaywright Stealth Configuration\n# Essential stealth for Playwright\nfrom playwright.sync_api import sync_playwright\n\nwith sync_playwright() as p:\n    browser = p.chromium.launch(\n        headless=True,\n        args=[\n            '--disable-blink-features=AutomationControlled',\n            '--disable-features=IsolateOrigins,site-per-process',\n        ]\n    )\n    context = browser.new_context(\n        viewport={\"width\": 1920, \"height\": 1080},\n        locale=\"en-US\",\n        timezone_id=\"America/New_York\",\n        geolocation={\"latitude\": 40.7128, \"longitude\": -74.0060},\n        permissions=[\"geolocation\"],\n    )\n    \n    # Remove automation indicators\n    page = context.new_page()\n    page.add_init_script(\"\"\"\n        Object.defineProperty(navigator, 'webdriver', {get: () => undefined});\n        
Object.defineProperty(navigator, 'plugins', {get: () => [1, 2, 3]});\n    \"\"\")\n\nCloudflare Bypass Decision\nCloudflare detected?\n├── JS Challenge only → Playwright with stealth + residential proxy\n├── Turnstile CAPTCHA → Managed service (ScrapingBee/Bright Data)\n├── Under Attack Mode → Wait, try later, or managed service\n└── WAF blocking → Different approach needed\n    ├── Check for API endpoints (network tab)\n    ├── Check for mobile app API\n    └── Consider if data is available elsewhere\n\nPhase 6: Data Pipeline & Quality\nData Validation Rules\n# Validation pattern — validate BEFORE storing\nfrom dataclasses import dataclass, field\nfrom typing import Optional\nimport re\nfrom datetime import datetime\n\n@dataclass\nclass ScrapedProduct:\n    url: str\n    title: str\n    price: Optional[float]\n    currency: str = \"USD\"\n    scraped_at: str = field(default_factory=lambda: datetime.utcnow().isoformat())\n    \n    def validate(self) -> list[str]:\n        errors = []\n        if not self.url.startswith('http'):\n            errors.append(\"Invalid URL\")\n        if not self.title or len(self.title) < 3:\n            errors.append(\"Title too short or missing\")\n        if self.price is not None and self.price < 0:\n            errors.append(\"Negative price\")\n        if self.price is not None and self.price > 1_000_000:\n            errors.append(\"Price suspiciously high — verify\")\n        if self.currency not in (\"USD\", \"EUR\", \"GBP\", \"BTC\"):\n            errors.append(f\"Unknown currency: {self.currency}\")\n        return errors\n\nDeduplication Strategy\nMethod\tWhen to Use\tImplementation\nURL-based\tPages with unique URLs\tHash the canonical URL\nContent hash\tSame URL, changing content\tMD5/SHA256 of key fields\nFuzzy matching\tNear-duplicate detection\tJaccard similarity > 0.85\nComposite key\tMulti-field uniqueness\tHash(domain + product_id + variant)\nimport hashlib\n\ndef dedup_key(item: dict, fields: list[str]) -> str:\n    \"\"\"Generate dedup key from selected fields.\"\"\"\n    values = \"|\".join(str(item.get(f, \"\")) for f in fields)\n    return hashlib.sha256(values.encode()).hexdigest()\n\n# Usage\nseen = set()\nfor item in scraped_items:\n    key = dedup_key(item, [\"url\", \"product_id\"])\n    if key not in seen:\n        seen.add(key)\n        clean_items.append(item)\n\nData Cleaning Pipeline\nRaw HTML → Parse → Extract → Validate → Clean → Deduplicate → Store\n                                ↓\n                          Quarantine (failed validation)\n\n\nCommon cleaning operations:\n\nProblem\tSolution\nHTML entities (&amp;)\thtml.unescape()\nExtra whitespace\t\" \".join(text.split())\nUnicode issues\tunicodedata.normalize('NFKD', text)\nPrice in text (\"$49.99\")\tRegex: r'[\\$£€]?([\\d,]+\\.?\\d*)'\nDate formats vary\tdateutil.parser.parse() with dayfirst flag\nRelative URLs\turllib.parse.urljoin(base, relative)\nEncoding issues\tchardet.detect() then decode\nPhase 7: Storage & Export\nStorage Decision Guide\nVolume\tFrequency\tQuery Needs\tRecommendation\n<10K records\tOne-time\tNone\tJSON/CSV files\n<10K records\tRecurring\tSimple lookups\tSQLite\n10K-1M records\tRecurring\tComplex queries\tPostgreSQL\n1M+ records\tContinuous\tAnalytics\tPostgreSQL + partitioning\nAppend-only logs\tContinuous\tTime-series\tClickHouse / TimescaleDB\nSQLite Pattern (Most Common)\nimport sqlite3\nimport json\nfrom datetime import datetime\n\ndef init_db(path=\"scraper_data.db\"):\n    conn = sqlite3.connect(path)\n    conn.execute(\"\"\"\n        
CREATE TABLE IF NOT EXISTS items (\n            id INTEGER PRIMARY KEY,\n            url TEXT UNIQUE,\n            data JSON NOT NULL,\n            scraped_at TEXT DEFAULT (datetime('now')),\n            updated_at TEXT,\n            checksum TEXT\n        )\n    \"\"\")\n    conn.execute(\"CREATE INDEX IF NOT EXISTS idx_url ON items(url)\")\n    conn.execute(\"CREATE INDEX IF NOT EXISTS idx_scraped ON items(scraped_at)\")\n    return conn\n\ndef upsert(conn, url, data, checksum):\n    conn.execute(\"\"\"\n        INSERT INTO items (url, data, checksum) VALUES (?, ?, ?)\n        ON CONFLICT(url) DO UPDATE SET\n            data = excluded.data,\n            updated_at = datetime('now'),\n            checksum = excluded.checksum\n        WHERE items.checksum != excluded.checksum\n    \"\"\", (url, json.dumps(data), checksum))\n    conn.commit()\n\nExport Formats\n# CSV export\nimport csv\ndef to_csv(items, path, fields):\n    with open(path, 'w', newline='') as f:\n        writer = csv.DictWriter(f, fieldnames=fields)\n        writer.writeheader()\n        writer.writerows(items)\n\n# JSON Lines (best for large datasets — streaming)\ndef to_jsonl(items, path):\n    with open(path, 'w') as f:\n        for item in items:\n            f.write(json.dumps(item) + '\\n')\n\n# Incremental export (only new/changed since last export)\ndef export_since(conn, last_export_time):\n    cursor = conn.execute(\n        \"SELECT data FROM items WHERE scraped_at > ? OR updated_at > ?\",\n        (last_export_time, last_export_time)\n    )\n    return [json.loads(row[0]) for row in cursor]\n\nPhase 8: Error Handling & Resilience\nError Classification\nHTTP Code\tMeaning\tAction\n200\tSuccess\tProcess normally\n301/302\tRedirect\tFollow (max 5 hops)\n403\tForbidden/blocked\tRotate proxy, slow down\n404\tNot found\tLog, skip, mark URL dead\n429\tRate limited\tRespect Retry-After, back off 2x\n500-504\tServer error\tRetry 3x with backoff\nConnection timeout\tNetwork issue\tRetry with different proxy\nSSL error\tCertificate issue\tLog, investigate, skip\nCircuit Breaker Pattern\nclass CircuitBreaker:\n    def __init__(self, failure_threshold=5, reset_timeout=300):\n        self.failures = 0\n        self.threshold = failure_threshold\n        self.reset_timeout = reset_timeout\n        self.last_failure = 0\n        self.state = \"closed\"  # closed | open | half-open\n    \n    def record_failure(self):\n        self.failures += 1\n        self.last_failure = time.time()\n        if self.failures >= self.threshold:\n            self.state = \"open\"\n            # Alert: \"Circuit open — too many failures\"\n    \n    def record_success(self):\n        self.failures = 0\n        self.state = \"closed\"\n    \n    def can_proceed(self):\n        if self.state == \"closed\":\n            return True\n        if self.state == \"open\":\n            if time.time() - self.last_failure > self.reset_timeout:\n                self.state = \"half-open\"\n                return True  # Try one request\n            return False\n        return True  # half-open: allow attempt\n\nCheckpoint & Resume\nimport json\nfrom pathlib import Path\n\nclass Checkpointer:\n    def __init__(self, path=\"checkpoint.json\"):\n        self.path = Path(path)\n        self.state = self._load()\n    \n    def _load(self):\n        if self.path.exists():\n            return json.loads(self.path.read_text())\n        return {\"completed_urls\": [], \"last_page\": 0, \"cursor\": None}\n    \n    def save(self):\n        
Checkpoint & Resume\nimport json\nfrom pathlib import Path\n\nclass Checkpointer:\n    def __init__(self, path=\"checkpoint.json\"):\n        self.path = Path(path)\n        self.state = self._load()\n    \n    def _load(self):\n        if self.path.exists():\n            return json.loads(self.path.read_text())\n        return {\"completed_urls\": [], \"last_page\": 0, \"cursor\": None}\n    \n    def save(self):\n        self.path.write_text(json.dumps(self.state))\n    \n    def is_done(self, url):\n        return url in self.state[\"completed_urls\"]\n    \n    def mark_done(self, url):\n        self.state[\"completed_urls\"].append(url)\n        if len(self.state[\"completed_urls\"]) % 50 == 0:\n            self.save()  # Periodic save — a crash loses at most 50 URLs\n\nPhase 9: Monitoring & Operations\nScraper Health Dashboard\ndashboard:\n  real_time:\n    - metric: \"requests_per_minute\"\n      alert_if: \"> 60 for small sites\"\n    - metric: \"success_rate\"\n      alert_if: \"< 90%\"\n    - metric: \"avg_response_time_ms\"\n      alert_if: \"> 5000\"\n    - metric: \"blocked_rate\"\n      alert_if: \"> 10%\"\n      \n  per_run:\n    - metric: \"pages_scraped\"\n    - metric: \"items_extracted\"\n    - metric: \"items_validated\"\n    - metric: \"items_deduplicated\"\n    - metric: \"new_items\"\n    - metric: \"updated_items\"\n    - metric: \"errors_by_type\"\n    - metric: \"run_duration\"\n    - metric: \"proxy_cost\"\n    \n  weekly:\n    - metric: \"data_freshness\"\n      description: \"% of records updated in the last 7 days\"\n    - metric: \"site_structure_changes\"\n      description: \"Selectors that stopped matching\"\n    - metric: \"total_cost\"\n      description: \"Proxy + compute + storage\"\n\nBreakage Detection\n\nSites redesign and selectors break. Detect it early:\n\ndef health_check(results: list[dict], expected_fields: list[str]) -> dict:\n    \"\"\"Check whether the scraper is still extracting correctly.\"\"\"\n    total = len(results)\n    if total == 0:\n        return {\"status\": \"CRITICAL\", \"message\": \"Zero results — likely broken\"}\n    \n    field_coverage = {}\n    for field in expected_fields:\n        filled = sum(1 for r in results if r.get(field))\n        field_coverage[field] = filled / total\n        \n    issues = []\n    for field, coverage in field_coverage.items():\n        if coverage < 0.5:\n            issues.append(f\"{field}: {coverage:.0%} fill rate (expected >50%)\")\n    \n    if issues:\n        return {\"status\": \"WARNING\", \"issues\": issues}\n    return {\"status\": \"OK\", \"field_coverage\": field_coverage}\n\nOperational Runbook\n\nDaily:\n\nCheck the success rate per target domain\nReview error logs for new patterns\nVerify data freshness\n\nWeekly:\n\nCompare extraction counts against baseline (a drop of more than 20% means investigate)\nReview proxy spend\nSpot-check 10 random records for accuracy\n\nMonthly:\n\nValidate all selectors against live pages\nReview legal compliance (robots.txt changes, ToS updates)\nReview costs for optimization opportunities\nPrune dead URLs from the queue\n\nPhase 10: Common Scraping Patterns\nPattern 1: E-commerce Price Monitor\nuse_case: \"Track competitor prices daily\"\ntool: \"requests + BeautifulSoup\"\nschedule: \"Daily at 03:00 UTC (off-peak)\"\ntargets: [\"competitor-a.com/products\", \"competitor-b.com/api\"]\ndata:\n  - product_id\n  - product_name\n  - price\n  - currency\n  - in_stock\n  - scraped_at\nstorage: \"SQLite with price history\"\nalerts: \"Price change > 10% → notify (see the sketch below)\"\n\n
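A sketch of that alert rule — price_history is a hypothetical table (one row per product per scrape, distinct from the items schema above) and notify() a stand-in for your alerting hook:\n\ndef check_price_change(conn, product_id, new_price, threshold=0.10):\n    # Compare against the most recent recorded price\n    row = conn.execute(\n        \"SELECT price FROM price_history WHERE product_id = ? \"\n        \"ORDER BY scraped_at DESC LIMIT 1\",\n        (product_id,),\n    ).fetchone()\n    if row and row[0]:\n        change = abs(new_price - row[0]) / row[0]\n        if change > threshold:\n            notify(f\"{product_id}: price moved {change:.0%} ({row[0]} → {new_price})\")\n    conn.execute(\n        \"INSERT INTO price_history (product_id, price) VALUES (?, ?)\",\n        (product_id, new_price),\n    )\n    conn.commit()\n\n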
mentions\"\ntool: \"requests + RSS feeds (preferred) + web fallback\"\nschedule: \"Every 30 minutes\"\napproach:\n  1: \"RSS/Atom feeds (fastest, cleanest)\"\n  2: \"Google News RSS for topic\"\n  3: \"Direct scraping if no feed\"\ndata:\n  - headline\n  - source\n  - url\n  - published_at\n  - snippet\n  - sentiment\nalerts: \"Keyword match → immediate notification\"\n\nPattern 4: Social Media Intelligence\nuse_case: \"Track brand mentions and sentiment\"\ntool: \"Official APIs (always) + web search fallback\"\nrules:\n  - NEVER scrape social platforms directly — use APIs\n  - Twitter/X: Official API ($100/mo basic)\n  - Reddit: Official API (free tier available)\n  - LinkedIn: No scraping (aggressive legal action)\n  - Instagram: Official API only (Meta Business)\nfallback: \"Brave/Google search for public mentions\"\n\nPattern 5: Real Estate Listings\nuse_case: \"Track property listings and prices\"\ntool: \"Playwright (most listing sites are JS-heavy)\"\nschedule: \"Daily\"\nchallenges:\n  - Heavy JavaScript rendering\n  - Anti-bot measures (Cloudflare common)\n  - Frequent layout changes\n  - Map-based results\napproach: \"API endpoint discovery via network tab first\"\n\nPhase 11: Scaling Strategies\nConcurrency Architecture\nSingle machine (small scale):\n├── asyncio + aiohttp (Python) → 50-200 concurrent requests\n├── Worker pool (ThreadPoolExecutor) → 10-50 threads\n└── Scrapy reactor → Built-in concurrency\n\nMulti-machine (large scale):\n├── URL queue: Redis / RabbitMQ / SQS\n├── Workers: Multiple Scrapy/custom workers\n├── Results: Shared PostgreSQL / S3\n└── Coordinator: Celery / custom scheduler\n\nCost Optimization\nLever\tImpact\tHow\nStatic > Browser\t10-50x cheaper\tAlways try HTTP first\nBlock images/CSS/fonts\t60-80% bandwidth saved\tRoute filtering\nCache DNS\tMinor but cumulative\tLocal DNS cache\nCompress responses\t50-70% bandwidth\tAccept-Encoding: gzip, br\nSmart scheduling\tAvoid redundant scrapes\tChange detection before full re-scrape\nProxy tier matching\t3-10x cost difference\tDon't use residential for easy sites\nPhase 12: Advanced Patterns\nAPI Discovery (Network Tab Mining)\n\nBefore building a scraper, check if the site has hidden API endpoints:\n\nOpen DevTools → Network tab\nFilter by XHR/Fetch\nNavigate the site, click load-more, filter/sort\nLook for JSON responses — these are your goldmine\nMost SPAs load data via REST/GraphQL APIs\n\nCommon hidden API patterns:\n\n/api/v1/products?page=1&limit=20\n/graphql with query parameters\n/_next/data/... 
Headless Browser Optimization\n# Minimize browser resource usage\ncontext = browser.new_context(\n    viewport={\"width\": 1280, \"height\": 720},\n    java_script_enabled=True,  # Set False if the content doesn't need JS\n    has_touch=False,\n    is_mobile=False,\n)\n\n# Block resource types you don't need\npage.route(\"**/*\", lambda route: (\n    route.abort() if route.request.resource_type in \n    [\"image\", \"stylesheet\", \"font\", \"media\"] \n    else route.continue_()\n))\n\nScraping Behind Authentication\n# When authorized to scrape behind a login:\n# ALWAYS use session-based auth; never hard-code passwords\nimport os\nimport requests\n\n# Pattern: log in once, reuse the session\nsession = requests.Session()\nlogin_resp = session.post(\"https://example.com/login\", data={\n    \"username\": os.environ[\"SCRAPE_USER\"],\n    \"password\": os.environ[\"SCRAPE_PASS\"],\n})\nassert login_resp.ok, \"Login failed\"\n\n# Session cookies are now stored — reuse them for subsequent requests\ndata_resp = session.get(\"https://example.com/api/data\")\n\nChange Detection (Avoid Redundant Scrapes)\ndef has_changed(url, session, last_etag=None, last_modified=None):\n    \"\"\"Check whether a page changed without downloading the full content.\"\"\"\n    headers = {}\n    if last_etag:\n        headers[\"If-None-Match\"] = last_etag\n    if last_modified:\n        headers[\"If-Modified-Since\"] = last_modified\n    \n    resp = session.head(url, headers=headers)\n    \n    if resp.status_code == 304:\n        return False, resp.headers.get(\"ETag\"), resp.headers.get(\"Last-Modified\")\n    \n    return True, resp.headers.get(\"ETag\"), resp.headers.get(\"Last-Modified\")\n\nQuality Scoring Rubric (0-100)\nDimension\tWeight\tWhat to Assess\nLegal compliance\t20%\trobots.txt, ToS, PII handling, audit trail\nData quality\t20%\tValidation, accuracy, completeness, freshness\nResilience\t15%\tError handling, retries, circuit breakers, checkpointing\nAnti-detection\t15%\tProxy rotation, fingerprint diversity, rate limiting\nArchitecture\t10%\tRight tool selection, clean code, modularity\nMonitoring\t10%\tSuccess rates, breakage detection, alerting\nPerformance\t5%\tSpeed, cost efficiency, resource usage\nDocumentation\t5%\tRunbook, schema docs, legal assessment\n\nGrading: 90+ Excellent | 75-89 Good | 60-74 Needs work | <60 Redesign\n\n10 Common Mistakes\n#\tMistake\tFix\n1\tNo robots.txt check\tAlways check first — it's your legal defense\n2\tFixed delays (no jitter)\tAdd ±30% random jitter to all delays (sketch below)\n3\tNo data validation\tValidate every field before storing\n4\tUsing a browser for static HTML\tAn HTTP client is 10-50x faster and cheaper\n5\tSingle IP, no rotation\tRotate proxies for any serious scraping\n6\tNo breakage detection\tMonitor extraction counts and field fill rates\n7\tStoring raw HTML only\tExtract and structure immediately\n8\tNo checkpoint/resume\tLong scrapes must be resumable\n9\tIgnoring structured data\tJSON-LD/microdata is cleaner than CSS selectors\n10\tScraping when an API exists\tAlways check for an API first\n\n
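Mistake #2 is the cheapest to fix — a tiny sketch; polite_sleep is an illustrative name, not a library function:\n\nimport random\nimport time\n\ndef polite_sleep(base_delay: float, jitter: float = 0.3) -> None:\n    \"\"\"Sleep base_delay ±30% so request timing never forms a fixed pattern.\"\"\"\n    time.sleep(base_delay * random.uniform(1 - jitter, 1 + jitter))\n\n# Usage: nominally 2s between requests, actually 1.4-2.6s\n# for url in urls:\n#     fetch(url)\n#     polite_sleep(2.0)\n\n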
5 Edge Cases\n\nSingle-page apps (React/Vue/Angular): Use browser rendering OR find the underlying API (network tab). Prefer API discovery — it's faster and more reliable.\n\nInfinite scroll: Intercept the XHR/fetch calls that load more content; simulate scrolling only as a last resort. The underlying endpoint usually accepts page or offset params.\n\nCAPTCHAs: If you're hitting CAPTCHAs, you're scraping too aggressively — slow down first. If CAPTCHAs persist, use a managed solving service (2Captcha, Anti-Captcha) or rethink the approach.\n\nDynamic class names (CSS modules, Tailwind): Use data attributes, ARIA labels, or text-content selectors instead. [data-testid=\"price\"] survives redesigns; .sc-bdVTJa does not.\n\nMulti-language sites: Detect the language via the html[lang] attribute and set the Accept-Language header to request the desired locale. Watch for different URL structures (/en/, /de/, subdomains).\n\nNatural Language Commands\n\"Check if I can scrape [URL]\" → Run the compliance checklist (robots.txt, ToS, data type)\n\"What tool should I use for [site]?\" → Analyze site rendering and anti-bot measures, recommend a tool\n\"Build a scraper for [description]\" → Full architecture brief + code pattern\n\"My scraper is getting blocked\" → Anti-detection diagnostic + proxy/stealth recommendations\n\"Extract [data] from [URL]\" → Check structured data first, then CSS selectors\n\"Monitor [site] for changes\" → Change detection + scheduling + alerting setup\n\"How do I handle pagination on [site]?\" → Identify the pagination type + code pattern\n\"Scrape at scale ([N] pages)\" → Concurrency architecture + cost estimate\n\"Clean and store this scraped data\" → Validation + dedup + storage recommendation\n\"Is my scraper healthy?\" → Run health check + breakage detection\n\"Find the API behind [site]\" → Network tab mining guide + common patterns\n\"Set up price monitoring for [competitors]\" → Full e-commerce monitor pattern"
  },
  "trust": {
    "sourceLabel": "tencent",
    "provenanceUrl": "https://clawhub.ai/1kalin/afrexai-web-scraping-engine",
    "publisherUrl": "https://clawhub.ai/1kalin/afrexai-web-scraping-engine",
    "owner": "1kalin",
    "version": "1.0.0",
    "license": null,
    "verificationStatus": "Indexed source record"
  },
  "links": {
    "detailUrl": "https://openagent3.xyz/skills/afrexai-web-scraping-engine",
    "downloadUrl": "https://openagent3.xyz/downloads/afrexai-web-scraping-engine",
    "agentUrl": "https://openagent3.xyz/skills/afrexai-web-scraping-engine/agent",
    "manifestUrl": "https://openagent3.xyz/skills/afrexai-web-scraping-engine/agent.json",
    "briefUrl": "https://openagent3.xyz/skills/afrexai-web-scraping-engine/agent.md"
  }
}