{
  "schemaVersion": "1.0",
  "item": {
    "slug": "arxiv-search-collector",
    "name": "Arxiv Search Collector",
    "source": "tencent",
    "type": "skill",
    "category": "开发工具",
    "sourceUrl": "https://clawhub.ai/xukp20/arxiv-search-collector",
    "canonicalUrl": "https://clawhub.ai/xukp20/arxiv-search-collector",
    "targetPlatform": "OpenClaw"
  },
  "install": {
    "downloadMode": "redirect",
    "downloadUrl": "/downloads/arxiv-search-collector",
    "sourceDownloadUrl": "https://wry-manatee-359.convex.site/api/v1/download?slug=arxiv-search-collector",
    "sourcePlatform": "tencent",
    "targetPlatform": "OpenClaw",
    "installMethod": "Manual import",
    "extraction": "Extract archive",
    "prerequisites": [
      "OpenClaw"
    ],
    "packageFormat": "ZIP package",
    "includedAssets": [
      "SKILL.md",
      "scripts/fetch_queries_batch.py",
      "scripts/fetch_query_metadata.py",
      "scripts/init_collection_run.py",
      "scripts/merge_selected_papers.py",
      "references/io-contract.md"
    ],
    "primaryDoc": "SKILL.md",
    "quickSetup": [
      "Download the package from Yavira.",
      "Extract the archive and review SKILL.md first.",
      "Import or place the package into your OpenClaw setup."
    ],
    "agentAssist": {
      "summary": "Hand the extracted package to your coding agent with a concrete install brief instead of figuring it out manually.",
      "steps": [
        "Download the package from Yavira.",
        "Extract it into a folder your agent can access.",
        "Paste one of the prompts below and point your agent at the extracted folder."
      ],
      "prompts": [
        {
          "label": "New install",
          "body": "I downloaded a skill package from Yavira. Read SKILL.md from the extracted folder and install it by following the included instructions. Tell me what you changed and call out any manual steps you could not complete."
        },
        {
          "label": "Upgrade existing",
          "body": "I downloaded an updated skill package from Yavira. Read SKILL.md from the extracted folder, compare it with my current installation, and upgrade it while preserving any custom configuration unless the package docs explicitly say otherwise. Summarize what changed and any follow-up checks I should run."
        }
      ]
    },
    "sourceHealth": {
      "source": "tencent",
      "status": "healthy",
      "reason": "direct_download_ok",
      "recommendedAction": "download",
      "checkedAt": "2026-04-23T16:43:11.935Z",
      "expiresAt": "2026-04-30T16:43:11.935Z",
      "httpStatus": 200,
      "finalUrl": "https://wry-manatee-359.convex.site/api/v1/download?slug=4claw-imageboard",
      "contentType": "application/zip",
      "probeMethod": "head",
      "details": {
        "probeUrl": "https://wry-manatee-359.convex.site/api/v1/download?slug=4claw-imageboard",
        "contentDisposition": "attachment; filename=\"4claw-imageboard-1.0.1.zip\"",
        "redirectLocation": null,
        "bodySnippet": null
      },
      "scope": "source",
      "summary": "Source download looks usable.",
      "detail": "Yavira can redirect you to the upstream package for this source.",
      "primaryActionLabel": "Download for OpenClaw",
      "primaryActionHref": "/downloads/arxiv-search-collector"
    },
    "validation": {
      "installChecklist": [
        "Use the Yavira download entry.",
        "Review SKILL.md after the package is downloaded.",
        "Confirm the extracted package contains the expected setup assets."
      ],
      "postInstallChecks": [
        "Confirm the extracted package includes the expected docs or setup files.",
        "Validate the skill or prompts are available in your target agent workspace.",
        "Capture any manual follow-up steps the agent could not complete."
      ]
    },
    "downloadPageUrl": "https://openagent3.xyz/downloads/arxiv-search-collector",
    "agentPageUrl": "https://openagent3.xyz/skills/arxiv-search-collector/agent",
    "manifestUrl": "https://openagent3.xyz/skills/arxiv-search-collector/agent.json",
    "briefUrl": "https://openagent3.xyz/skills/arxiv-search-collector/agent.md"
  },
  "agentAssist": {
    "summary": "Hand the extracted package to your coding agent with a concrete install brief instead of figuring it out manually.",
    "steps": [
      "Download the package from Yavira.",
      "Extract it into a folder your agent can access.",
      "Paste one of the prompts below and point your agent at the extracted folder."
    ],
    "prompts": [
      {
        "label": "New install",
        "body": "I downloaded a skill package from Yavira. Read SKILL.md from the extracted folder and install it by following the included instructions. Tell me what you changed and call out any manual steps you could not complete."
      },
      {
        "label": "Upgrade existing",
        "body": "I downloaded an updated skill package from Yavira. Read SKILL.md from the extracted folder, compare it with my current installation, and upgrade it while preserving any custom configuration unless the package docs explicitly say otherwise. Summarize what changed and any follow-up checks I should run."
      }
    ]
  },
  "documentation": {
    "source": "clawhub",
    "primaryDoc": "SKILL.md",
    "sections": [
      {
        "title": "ArXiv Search Collector",
        "body": "Use this skill when you want model-led query planning and model-led relevance filtering."
      },
      {
        "title": "Core Principle",
        "body": "Scripts are tools. The model performs the reasoning and decisions:\n\nExpand the original topic into multiple focused queries.\nRun one fetch command per query.\nRead each query result list and decide keep indexes.\nMerge kept items and dedupe with one script."
      },
      {
        "title": "Step 1: Initialize Run",
        "body": "python3 scripts/init_collection_run.py \\\n  --output-root /path/to/data \\\n  --topic \"LLM applications in Lean 4 formalization\" \\\n  --keywords \"Lean 4,LLM,formalization\" \\\n  --categories \"cs.AI,cs.LO\" \\\n  --target-range 5-10 \\\n  --lookback 30d \\\n  --language English\n\nThis creates a run directory with task_meta.json, task_meta.md, query_results/, and query_selection/."
      },
      {
        "title": "Language Parameter",
        "body": "--language must be set manually for each collection run.\nUse the same language value across all collector scripts for consistency.\nIf --language is non-English (for example Chinese), generated markdown files are written in that language:\n\ntask_meta.md\nquery_results/<label>.md\n<arxiv_id>/metadata.md\npapers_index.md"
      },
      {
        "title": "Query Writing Requirements",
        "body": "Follow these rules before running per-query fetch:\n\nDetermine query count from final target range.\n\nPrefer 3 queries for small/medium targets (2-5, 5-10).\nPrefer 4 queries for larger targets (10-50 or above).\nAvoid writing too many low-quality queries.\n\nAllocate target budget to each query, then oversample.\n\nLet target_max be the upper bound in target range.\nCompute target_per_query = ceil(target_max / query_count).\nFetch each query with max_results = target_per_query * 2 (or * 3 when recall is more important).\nExample: target 5-10, query count 3 -> target_per_query=4 -> each query fetches 8-12.\n\nKeep one original-theme query, then add normalized/synonym expansions.\n\nQuery 1 keeps original topic wording.\nRemaining queries use normalized terms and close synonyms.\nPrefer concise noun phrases that match arXiv indexing behavior.\n\nUse OR inside the same semantic group (synonyms), and AND across groups.\n\nSame-group synonyms should be connected with OR to increase recall.\n\nExample group A (model terms): LLM OR \"large language model\" OR AI.\nExample group B (Lean terms): \"Lean 4\" OR Lean OR \"formal language\".\n\n\nDifferent semantic groups should be connected with AND to keep relevance.\n\nExample: (LLM-group) AND (Lean-group).\n\n\nRecommended pattern:\n\n(<domain terms with OR>) AND (<method/model terms with OR>) [AND <optional constraint terms>]"
      },
      {
        "title": "Query Examples (arXiv API-ready)",
        "body": "Theme A: LLM applications in Lean 4 formalization\n\nall:\"LLM applications in Lean 4 formalization\"\n(all:\"Lean 4\" OR all:\"Lean\" OR all:\"formal language\") AND (all:\"LLM\" OR all:\"large language model\" OR all:\"AI\")\n(all:\"Lean\" OR all:\"formalization\") AND (all:\"LLM\" OR all:\"large language model\") AND all:\"theorem proving\"\n(all:\"Lean\" OR all:\"proof assistant\") AND (all:\"AI\" OR all:\"LLM\")\n\nTheme B: agentic tool use for code generation\n\nall:\"agentic tool use code generation\"\n(all:\"agentic\" OR all:\"autonomous agent\") AND (all:\"LLM\" OR all:\"large language model\")\n(all:\"tool use\" OR all:\"function calling\") AND (all:\"coding assistant\" OR all:\"code generation\")\n\nTheme C: multimodal reasoning with retrieval\n\nall:\"multimodal reasoning retrieval\"\n(all:\"multimodal\" OR all:\"vision language\") AND (all:\"retrieval\" OR all:\"RAG\")\n(all:\"multimodal model\" OR all:\"vision language model\") AND (all:\"reasoning\" OR all:\"tool use\")"
      },
      {
        "title": "Step 2: Fetch One Query at a Time",
        "body": "Model defines queries manually, for example:\n\nall:\"Lean 4\"\nall:\"LLM formalization\"\nall:\"AI formal verification\"\n\nRecommended batch mode (safe defaults, serial execution):\n\npython3 scripts/fetch_queries_batch.py \\\n  --run-dir /path/to/run-dir \\\n  --plan-json /path/to/query_plan.json\n\nIn batch mode, the script auto-applies:\n\nserial API calls\n--min-interval-sec 5\n--retry-max 4\n--retry-base-sec 5\n--retry-max-sec 120\n--retry-jitter-sec 1\nper-run rate-state file (<run_dir>/.runtime/arxiv_api_state.json) for throttling\nauto max_results from target_range and query count (default oversample x2, cap 60)\ndefault language/categories from task_meta.json\n\nMinimal query_plan.json only needs label and query.\nSee references/query-plan-format.md.\nYou normally do not need to set fetch-control args manually.\n\nIf you need one-by-one manual fetch, run each query:\n\npython3 scripts/fetch_query_metadata.py \\\n  --run-dir /path/to/run-dir \\\n  --label lean4 \\\n  --query 'all:\"Lean 4\"' \\\n  --max-results 30 \\\n  --min-interval-sec 5 \\\n  --retry-max 4 \\\n  --language English\n\nOutput files:\n\nquery_results/<label>.json (indexed full metadata list)\nquery_results/<label>.md (human-readable preview)\n\nDate range is applied directly in arXiv API search_query via submittedDate:[... TO ...].\nNo second local date-filter pass is performed.\n\nRate-limit controls in fetch_query_metadata.py:\n\n--min-interval-sec (default 5.0)\n--retry-max (default 4)\n--retry-base-sec (default 5.0)\n--retry-max-sec (default 120.0)\n--retry-jitter-sec (default 1.0)\n--rate-state-path (optional override; default is <run_dir>/.runtime/arxiv_api_state.json)\n--force to bypass cache and re-fetch"
      },
      {
        "title": "Step 3: Model Filters Relevance",
        "body": "For each query list, the model reads indexed results and decides what to keep.\n\nUse keep specs by index and/or arXiv ID when merging.\nTo explicitly drop one weak query in later iterations, set that label to an empty keep list in selection-json."
      },
      {
        "title": "Step 4: Merge and Dedupe",
        "body": "python3 scripts/merge_selected_papers.py \\\n  --run-dir /path/to/run-dir \\\n  --keep lean4:0,2,4 \\\n  --keep llm-formalization:1,3 \\\n  --language English\n\nor with selection-json:\n\n{\n  \"lean4-round1\": [0, 2, 4],\n  \"lean4-round2\": [],\n  \"formalization-round2\": [1, 3, 5]\n}\n\nAn empty list means this query label is intentionally dropped (keep 0).\n\nThis writes final outputs:\n\n<arxiv_id>/metadata.json\n<arxiv_id>/metadata.md\npapers_index.json\npapers_index.md"
      },
      {
        "title": "Step 5: Iterative Retry Loop (Incremental)",
        "body": "If relevance is weak or final count is insufficient after Step 4, iterate:\n\nReview papers_index.md and per-paper metadata quality.\nAdjust query plan (usually broaden with additional synonym OR terms, keep cross-group AND constraints).\nFetch additional query results with new labels.\nRe-run merge in incremental mode:\n\npython3 scripts/merge_selected_papers.py \\\n  --run-dir /path/to/run-dir \\\n  --incremental \\\n  --selection-json /path/to/updated_selection.json \\\n  --language English\n\nIncremental behavior:\n\nPrevious label selections are loaded from query_selection/selected_by_query.json.\nLabels provided in the new selection-json override previous selections for those labels.\nNew labels can be added.\nOld labels can be dropped by setting [].\n\nStop retrying when:\n\nrelevance is acceptable, or\nadditional broadened queries mainly add low-relevance papers.\n\nIf relevant papers are genuinely scarce, it is valid to finish below the original minimum target range."
      },
      {
        "title": "Notes",
        "body": "Keep API concurrency conservative by controlling query count and --max-results.\nKeep per-query fetch serial (no parallel API calls in Stage A).\nReuse cache by default for identical query/date/request settings; only use --force when necessary.\nPrefer default run-local rate-state so all steps in the same run share one cooldown/throttling state.\nIf arXiv API returns 429 Too Many Requests, retry later and/or increase --min-interval-sec.\nPrefer explicit, narrow queries and let the model filter aggressively.\nUse references/io-contract.md for exact files and schema."
      },
      {
        "title": "Related Skills",
        "body": "This skill is a sub-skill of arxiv-summarizer-orchestrator.\n\nPipeline position:\n\nStep 1 (collection): arxiv-search-collector (this skill)\nStep 2 (per-paper processing): arxiv-paper-processor\nStep 3 (batch reporting): arxiv-batch-reporter\n\nThis skill produces the initial paper-set structure and metadata that Stage B and Stage C depend on."
      }
    ],
    "body": "ArXiv Search Collector\n\nUse this skill when you want model-led query planning and model-led relevance filtering.\n\nCore Principle\n\nScripts are tools. The model performs the reasoning and decisions:\n\nExpand the original topic into multiple focused queries.\nRun one fetch command per query.\nRead each query result list and decide keep indexes.\nMerge kept items and dedupe with one script.\nStep 1: Initialize Run\npython3 scripts/init_collection_run.py \\\n  --output-root /path/to/data \\\n  --topic \"LLM applications in Lean 4 formalization\" \\\n  --keywords \"Lean 4,LLM,formalization\" \\\n  --categories \"cs.AI,cs.LO\" \\\n  --target-range 5-10 \\\n  --lookback 30d \\\n  --language English\n\n\nThis creates a run directory with task_meta.json, task_meta.md, query_results/, and query_selection/.\n\nLanguage Parameter\n--language must be set manually for each collection run.\nUse the same language value across all collector scripts for consistency.\nIf --language is non-English (for example Chinese), generated markdown files are written in that language:\ntask_meta.md\nquery_results/<label>.md\n<arxiv_id>/metadata.md\npapers_index.md\nQuery Writing Requirements\n\nFollow these rules before running per-query fetch:\n\nDetermine query count from final target range.\nPrefer 3 queries for small/medium targets (2-5, 5-10).\nPrefer 4 queries for larger targets (10-50 or above).\nAvoid writing too many low-quality queries.\nAllocate target budget to each query, then oversample.\nLet target_max be the upper bound in target range.\nCompute target_per_query = ceil(target_max / query_count).\nFetch each query with max_results = target_per_query * 2 (or * 3 when recall is more important).\nExample: target 5-10, query count 3 -> target_per_query=4 -> each query fetches 8-12.\nKeep one original-theme query, then add normalized/synonym expansions.\nQuery 1 keeps original topic wording.\nRemaining queries use normalized terms and close synonyms.\nPrefer concise noun phrases that match arXiv indexing behavior.\nUse OR inside the same semantic group (synonyms), and AND across groups.\nSame-group synonyms should be connected with OR to increase recall.\nExample group A (model terms): LLM OR \"large language model\" OR AI.\nExample group B (Lean terms): \"Lean 4\" OR Lean OR \"formal language\".\nDifferent semantic groups should be connected with AND to keep relevance.\nExample: (LLM-group) AND (Lean-group).\nRecommended pattern:\n(<domain terms with OR>) AND (<method/model terms with OR>) [AND <optional constraint terms>]\nQuery Examples (arXiv API-ready)\n\nTheme A: LLM applications in Lean 4 formalization\n\nall:\"LLM applications in Lean 4 formalization\"\n(all:\"Lean 4\" OR all:\"Lean\" OR all:\"formal language\") AND (all:\"LLM\" OR all:\"large language model\" OR all:\"AI\")\n(all:\"Lean\" OR all:\"formalization\") AND (all:\"LLM\" OR all:\"large language model\") AND all:\"theorem proving\"\n(all:\"Lean\" OR all:\"proof assistant\") AND (all:\"AI\" OR all:\"LLM\")\n\nTheme B: agentic tool use for code generation\n\nall:\"agentic tool use code generation\"\n(all:\"agentic\" OR all:\"autonomous agent\") AND (all:\"LLM\" OR all:\"large language model\")\n(all:\"tool use\" OR all:\"function calling\") AND (all:\"coding assistant\" OR all:\"code generation\")\n\nTheme C: multimodal reasoning with retrieval\n\nall:\"multimodal reasoning retrieval\"\n(all:\"multimodal\" OR all:\"vision language\") AND (all:\"retrieval\" OR all:\"RAG\")\n(all:\"multimodal model\" OR all:\"vision language model\") AND (all:\"reasoning\" OR all:\"tool use\")\nStep 2: Fetch One Query at a Time\n\nModel defines queries manually, for example:\n\nall:\"Lean 4\"\nall:\"LLM formalization\"\nall:\"AI formal verification\"\n\nRecommended batch mode (safe defaults, serial execution):\n\npython3 scripts/fetch_queries_batch.py \\\n  --run-dir /path/to/run-dir \\\n  --plan-json /path/to/query_plan.json\n\n\nIn batch mode, the script auto-applies:\n\nserial API calls\n--min-interval-sec 5\n--retry-max 4\n--retry-base-sec 5\n--retry-max-sec 120\n--retry-jitter-sec 1\nper-run rate-state file (<run_dir>/.runtime/arxiv_api_state.json) for throttling\nauto max_results from target_range and query count (default oversample x2, cap 60)\ndefault language/categories from task_meta.json\n\nMinimal query_plan.json only needs label and query. See references/query-plan-format.md. You normally do not need to set fetch-control args manually.\n\nIf you need one-by-one manual fetch, run each query:\n\npython3 scripts/fetch_query_metadata.py \\\n  --run-dir /path/to/run-dir \\\n  --label lean4 \\\n  --query 'all:\"Lean 4\"' \\\n  --max-results 30 \\\n  --min-interval-sec 5 \\\n  --retry-max 4 \\\n  --language English\n\n\nOutput files:\n\nquery_results/<label>.json (indexed full metadata list)\nquery_results/<label>.md (human-readable preview)\n\nDate range is applied directly in arXiv API search_query via submittedDate:[... TO ...]. No second local date-filter pass is performed.\n\nRate-limit controls in fetch_query_metadata.py:\n\n--min-interval-sec (default 5.0)\n--retry-max (default 4)\n--retry-base-sec (default 5.0)\n--retry-max-sec (default 120.0)\n--retry-jitter-sec (default 1.0)\n--rate-state-path (optional override; default is <run_dir>/.runtime/arxiv_api_state.json)\n--force to bypass cache and re-fetch\nStep 3: Model Filters Relevance\n\nFor each query list, the model reads indexed results and decides what to keep.\n\nUse keep specs by index and/or arXiv ID when merging. To explicitly drop one weak query in later iterations, set that label to an empty keep list in selection-json.\n\nStep 4: Merge and Dedupe\npython3 scripts/merge_selected_papers.py \\\n  --run-dir /path/to/run-dir \\\n  --keep lean4:0,2,4 \\\n  --keep llm-formalization:1,3 \\\n  --language English\n\n\nor with selection-json:\n\n{\n  \"lean4-round1\": [0, 2, 4],\n  \"lean4-round2\": [],\n  \"formalization-round2\": [1, 3, 5]\n}\n\n\nAn empty list means this query label is intentionally dropped (keep 0).\n\nThis writes final outputs:\n\n<arxiv_id>/metadata.json\n<arxiv_id>/metadata.md\npapers_index.json\npapers_index.md\nStep 5: Iterative Retry Loop (Incremental)\n\nIf relevance is weak or final count is insufficient after Step 4, iterate:\n\nReview papers_index.md and per-paper metadata quality.\nAdjust query plan (usually broaden with additional synonym OR terms, keep cross-group AND constraints).\nFetch additional query results with new labels.\nRe-run merge in incremental mode:\npython3 scripts/merge_selected_papers.py \\\n  --run-dir /path/to/run-dir \\\n  --incremental \\\n  --selection-json /path/to/updated_selection.json \\\n  --language English\n\n\nIncremental behavior:\n\nPrevious label selections are loaded from query_selection/selected_by_query.json.\nLabels provided in the new selection-json override previous selections for those labels.\nNew labels can be added.\nOld labels can be dropped by setting [].\n\nStop retrying when:\n\nrelevance is acceptable, or\nadditional broadened queries mainly add low-relevance papers.\n\nIf relevant papers are genuinely scarce, it is valid to finish below the original minimum target range.\n\nNotes\nKeep API concurrency conservative by controlling query count and --max-results.\nKeep per-query fetch serial (no parallel API calls in Stage A).\nReuse cache by default for identical query/date/request settings; only use --force when necessary.\nPrefer default run-local rate-state so all steps in the same run share one cooldown/throttling state.\nIf arXiv API returns 429 Too Many Requests, retry later and/or increase --min-interval-sec.\nPrefer explicit, narrow queries and let the model filter aggressively.\nUse references/io-contract.md for exact files and schema.\nRelated Skills\n\nThis skill is a sub-skill of arxiv-summarizer-orchestrator.\n\nPipeline position:\n\nStep 1 (collection): arxiv-search-collector (this skill)\nStep 2 (per-paper processing): arxiv-paper-processor\nStep 3 (batch reporting): arxiv-batch-reporter\n\nThis skill produces the initial paper-set structure and metadata that Stage B and Stage C depend on."
  },
  "trust": {
    "sourceLabel": "tencent",
    "provenanceUrl": "https://clawhub.ai/xukp20/arxiv-search-collector",
    "publisherUrl": "https://clawhub.ai/xukp20/arxiv-search-collector",
    "owner": "xukp20",
    "version": "0.1.1",
    "license": null,
    "verificationStatus": "Indexed source record"
  },
  "links": {
    "detailUrl": "https://openagent3.xyz/skills/arxiv-search-collector",
    "downloadUrl": "https://openagent3.xyz/downloads/arxiv-search-collector",
    "agentUrl": "https://openagent3.xyz/skills/arxiv-search-collector/agent",
    "manifestUrl": "https://openagent3.xyz/skills/arxiv-search-collector/agent.json",
    "briefUrl": "https://openagent3.xyz/skills/arxiv-search-collector/agent.md"
  }
}