{
  "schemaVersion": "1.0",
  "item": {
    "slug": "afrexai-rag-engineering",
    "name": "RAG Engineering",
    "source": "tencent",
    "type": "skill",
    "category": "AI 智能",
    "sourceUrl": "https://clawhub.ai/1kalin/afrexai-rag-engineering",
    "canonicalUrl": "https://clawhub.ai/1kalin/afrexai-rag-engineering",
    "targetPlatform": "OpenClaw"
  },
  "install": {
    "downloadMode": "redirect",
    "downloadUrl": "/downloads/afrexai-rag-engineering",
    "sourceDownloadUrl": "https://wry-manatee-359.convex.site/api/v1/download?slug=afrexai-rag-engineering",
    "sourcePlatform": "tencent",
    "targetPlatform": "OpenClaw",
    "installMethod": "Manual import",
    "extraction": "Extract archive",
    "prerequisites": [
      "OpenClaw"
    ],
    "packageFormat": "ZIP package",
    "includedAssets": [
      "README.md",
      "SKILL.md"
    ],
    "primaryDoc": "SKILL.md",
    "quickSetup": [
      "Download the package from Yavira.",
      "Extract the archive and review SKILL.md first.",
      "Import or place the package into your OpenClaw setup."
    ],
    "agentAssist": {
      "summary": "Hand the extracted package to your coding agent with a concrete install brief instead of figuring it out manually.",
      "steps": [
        "Download the package from Yavira.",
        "Extract it into a folder your agent can access.",
        "Paste one of the prompts below and point your agent at the extracted folder."
      ],
      "prompts": [
        {
          "label": "New install",
          "body": "I downloaded a skill package from Yavira. Read SKILL.md from the extracted folder and install it by following the included instructions. Then review README.md for any prerequisites, environment setup, or post-install checks. Tell me what you changed and call out any manual steps you could not complete."
        },
        {
          "label": "Upgrade existing",
          "body": "I downloaded an updated skill package from Yavira. Read SKILL.md from the extracted folder, compare it with my current installation, and upgrade it while preserving any custom configuration unless the package docs explicitly say otherwise. Then review README.md for any prerequisites, environment setup, or post-install checks. Summarize what changed and any follow-up checks I should run."
        }
      ]
    },
    "sourceHealth": {
      "source": "tencent",
      "status": "healthy",
      "reason": "direct_download_ok",
      "recommendedAction": "download",
      "checkedAt": "2026-04-30T16:55:25.780Z",
      "expiresAt": "2026-05-07T16:55:25.780Z",
      "httpStatus": 200,
      "finalUrl": "https://wry-manatee-359.convex.site/api/v1/download?slug=network",
      "contentType": "application/zip",
      "probeMethod": "head",
      "details": {
        "probeUrl": "https://wry-manatee-359.convex.site/api/v1/download?slug=network",
        "contentDisposition": "attachment; filename=\"network-1.0.0.zip\"",
        "redirectLocation": null,
        "bodySnippet": null
      },
      "scope": "source",
      "summary": "Source download looks usable.",
      "detail": "Yavira can redirect you to the upstream package for this source.",
      "primaryActionLabel": "Download for OpenClaw",
      "primaryActionHref": "/downloads/afrexai-rag-engineering"
    },
    "validation": {
      "installChecklist": [
        "Use the Yavira download entry.",
        "Review SKILL.md after the package is downloaded.",
        "Confirm the extracted package contains the expected setup assets."
      ],
      "postInstallChecks": [
        "Confirm the extracted package includes the expected docs or setup files.",
        "Validate the skill or prompts are available in your target agent workspace.",
        "Capture any manual follow-up steps the agent could not complete."
      ]
    },
    "downloadPageUrl": "https://openagent3.xyz/downloads/afrexai-rag-engineering",
    "agentPageUrl": "https://openagent3.xyz/skills/afrexai-rag-engineering/agent",
    "manifestUrl": "https://openagent3.xyz/skills/afrexai-rag-engineering/agent.json",
    "briefUrl": "https://openagent3.xyz/skills/afrexai-rag-engineering/agent.md"
  },
  "agentAssist": {
    "summary": "Hand the extracted package to your coding agent with a concrete install brief instead of figuring it out manually.",
    "steps": [
      "Download the package from Yavira.",
      "Extract it into a folder your agent can access.",
      "Paste one of the prompts below and point your agent at the extracted folder."
    ],
    "prompts": [
      {
        "label": "New install",
        "body": "I downloaded a skill package from Yavira. Read SKILL.md from the extracted folder and install it by following the included instructions. Then review README.md for any prerequisites, environment setup, or post-install checks. Tell me what you changed and call out any manual steps you could not complete."
      },
      {
        "label": "Upgrade existing",
        "body": "I downloaded an updated skill package from Yavira. Read SKILL.md from the extracted folder, compare it with my current installation, and upgrade it while preserving any custom configuration unless the package docs explicitly say otherwise. Then review README.md for any prerequisites, environment setup, or post-install checks. Summarize what changed and any follow-up checks I should run."
      }
    ]
  },
  "documentation": {
    "source": "clawhub",
    "primaryDoc": "SKILL.md",
    "sections": [
      {
        "title": "RAG Engineering — Complete Retrieval-Augmented Generation System",
        "body": "Build production RAG systems that actually work. From chunking strategy to evaluation — the complete methodology.\n\nYou are an expert RAG engineer. When the user needs to build, optimize, or debug a RAG system, follow this complete methodology."
      },
      {
        "title": "Quick Health Check (Existing Systems)",
        "body": "SignalHealthyWarningCriticalAnswer relevance>85% users satisfied60-85%<60%Retrieval precision@5>70% relevant chunks40-70%<40%Hallucination rate<5%5-15%>15%Latency (P95)<3s3-8s>8sContext utilization>60% of retrieved used30-60%<30%Cost per query<$0.05$0.05-0.20>$0.20"
      },
      {
        "title": "RAG Project Brief",
        "body": "rag_brief:\n  project: \"[name]\"\n  date: \"YYYY-MM-DD\"\n\n  # What problem are we solving?\n  use_case: \"[customer support / code search / document Q&A / research / legal / medical]\"\n  user_persona: \"[who asks questions]\"\n  query_types:\n    - factual: \"[percentage] — direct fact lookup\"\n    - analytical: \"[percentage] — synthesis across documents\"\n    - procedural: \"[percentage] — how-to, step-by-step\"\n    - comparative: \"[percentage] — compare X vs Y\"\n    - conversational: \"[percentage] — multi-turn follow-ups\"\n\n  # What data do we have?\n  corpus:\n    total_documents: \"[count]\"\n    total_size: \"[GB/TB]\"\n    document_types:\n      - type: \"[PDF/HTML/markdown/code/JSON/CSV]\"\n        count: \"[count]\"\n        avg_length: \"[pages/tokens]\"\n    update_frequency: \"[static / daily / real-time]\"\n    languages: [\"en\", \"...\"]\n    quality: \"[curated / mixed / noisy]\"\n\n  # Requirements\n  accuracy_target: \"[% — start with 85%]\"\n  latency_target: \"[ms P95]\"\n    max_cost_per_query: \"[$]\"\n  scale: \"[queries/day]\"\n  multi_turn: \"[yes/no]\"\n  citations_required: \"[yes/no]\"\n\n  # Constraints\n  deployment: \"[cloud / on-prem / hybrid]\"\n  data_sensitivity: \"[public / internal / PII / regulated]\"\n  budget: \"[$/month for infrastructure]\""
      },
      {
        "title": "RAG Architecture Decision Tree",
        "body": "Is your corpus < 100 documents AND < 50 pages each?\n├─ YES → Consider full-context stuffing (no RAG needed)\n│        Use: Long-context model (Gemini 1M, Claude 200K)\n│        When: Static docs, low query volume, budget allows\n│\n└─ NO → RAG is appropriate\n         │\n         Is real-time freshness critical?\n         ├─ YES → Streaming RAG with incremental indexing\n         └─ NO → Batch-indexed RAG\n                  │\n                  Do queries need multi-step reasoning?\n                  ├─ YES → Agentic RAG (query planning + tool use)\n                  └─ NO → Standard retrieval pipeline\n                           │\n                           Single document type?\n                           ├─ YES → Single-index RAG\n                           └─ NO → Multi-index with routing"
      },
      {
        "title": "Architecture Patterns",
        "body": "PatternUse CaseComplexityQualityNaive RAGSimple Q&A, prototypesLowMediumAdvanced RAGProduction systemsMediumHighModular RAGComplex multi-sourceHighHighestAgentic RAGMulti-step researchHighestHighestGraph RAGEntity-heavy domainsHighHigh for relational queriesHybrid RAGMixed query typesMedium-HighHigh"
      },
      {
        "title": "Document Processing Pipeline",
        "body": "Raw Documents → Extraction → Cleaning → Enrichment → Chunking → Embedding → Indexing"
      },
      {
        "title": "Extraction Strategy by Document Type",
        "body": "Document TypeExtraction ToolKey ChallengesQuality TipsPDFPyMuPDF, Unstructured, DoclingTables, images, multi-columnUse layout-aware parser; OCR for scannedHTMLBeautifulSoup, TrafilaturaBoilerplate, navigationExtract main content only; preserve headersMarkdownDirect parseMinimalPreserve structure; handle frontmatterCodeTree-sitter, ASTContext lossInclude file path + imports as metadataCSV/JSONpandas, jqSchema understandingConvert rows to natural languageDOCX/PPTXpython-docx, python-pptxFormatting, embedded mediaExtract text + table structureImagesGPT-4V, Claude VisionOCR accuracyGenerate text descriptions; store as metadataAudio/VideoWhisper, AssemblyTimestamps, speakersChunk by speaker turn or topic segment"
      },
      {
        "title": "Cleaning Checklist",
        "body": "Remove headers/footers/page numbers (PDF artifacts)\n Normalize whitespace (collapse multiple spaces/newlines)\n Fix encoding issues (UTF-8 normalize)\n Remove boilerplate (disclaimers, repeated navigation)\n Preserve meaningful formatting (tables, lists, code blocks)\n Handle special characters and Unicode consistently\n Detect and flag low-quality documents (OCR confidence < 80%)\n Deduplicate (exact + near-duplicate detection)"
      },
      {
        "title": "Metadata Enrichment",
        "body": "Always extract and store:\n\ndocument_metadata:\n  source_id: \"[unique document identifier]\"\n  source_url: \"[original URL or file path]\"\n  title: \"[document title]\"\n  author: \"[if available]\"\n  created_date: \"[ISO 8601]\"\n  modified_date: \"[ISO 8601]\"\n  document_type: \"[pdf/html/md/code/...]\"\n  language: \"[ISO 639-1]\"\n  section_hierarchy: [\"Chapter\", \"Section\", \"Subsection\"]\n  tags: [\"auto-generated\", \"topic\", \"tags\"]\n  access_level: \"[public/internal/restricted]\"\n  quality_score: \"[0-100 from cleaning pipeline]\"\n\nEnrichment strategies:\n\nAuto-generate summaries per document (for hybrid search)\nExtract entities (people, companies, products, dates)\nClassify by topic/category\nGenerate hypothetical questions (HyDE technique at index time)"
      },
      {
        "title": "The Chunking Decision Is Critical",
        "body": "Bad chunking is the #1 cause of poor RAG quality. No amount of model sophistication fixes bad chunks."
      },
      {
        "title": "Chunking Method Selection",
        "body": "MethodBest ForChunk QualityImplementationFixed-sizeHomogeneous text, quick prototypeMediumSimpleRecursive characterGeneral purpose, structured docsGoodLangChain defaultSemanticVaried content, topic shiftsHighEmbedding-basedDocument-structureTechnical docs, legal, academicHighestCustom per doc typeAgentic/LLMHigh-value docs, complex structureHighestExpensiveSentence-windowDense factual contentGoodSentence + contextParent-childHierarchical docs, manualsHighTwo-level index"
      },
      {
        "title": "Chunking Decision Tree",
        "body": "Is your content highly structured (headers, sections, numbered)?\n├─ YES → Document-structure chunking\n│        Split on: H1 > H2 > H3 > paragraph boundaries\n│        Keep: section title chain as metadata\n│\n└─ NO → Is content topically diverse within documents?\n         ├─ YES → Semantic chunking\n         │        Split when: embedding similarity drops below threshold\n         │        Typical threshold: cosine similarity < 0.75\n         │\n         └─ NO → Recursive character splitting\n                  With: chunk_size=512, overlap=64 (tokens)\n                  Separators: [\"\\n\\n\", \"\\n\", \". \", \" \"]"
      },
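      {
        "title": "Example: Recursive Character Splitting (Sketch)",
        "body": "A minimal sketch of the recursive character branch above, using whitespace-word counts as a crude token proxy (swap in your tokenizer). The defaults mirror the chunk_size=512 / overlap=64 settings from the decision tree; treat it as an illustration, not a drop-in splitter.\n\ndef recursive_split(text, chunk_size=512, overlap=64,\n                    separators=(\"\\n\\n\", \"\\n\", \". \", \" \")):\n    def size(t):\n        return len(t.split())  # crude token proxy\n    if size(text) <= chunk_size or not separators:\n        return [text]\n    sep, rest = separators[0], separators[1:]\n    chunks, current = [], \"\"\n    for piece in text.split(sep):\n        candidate = (current + sep + piece) if current else piece\n        if size(candidate) <= chunk_size:\n            current = candidate\n        elif size(piece) > chunk_size:\n            if current:\n                chunks.append(current)\n            # Oversized piece: recurse with the next, finer separator\n            chunks.extend(recursive_split(piece, chunk_size, overlap, rest))\n            current = \"\"\n        else:\n            if current:\n                chunks.append(current)\n            current = piece\n    if current:\n        chunks.append(current)\n    # Carry a word-level overlap into each following chunk\n    out = chunks[:1]\n    for prev, nxt in zip(chunks, chunks[1:]):\n        tail = prev.split()[-overlap:] if overlap else []\n        out.append((\" \".join(tail) + sep if tail else \"\") + nxt)\n    return out"
      },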
      {
        "title": "Chunk Size Guidelines",
        "body": "Use CaseTarget TokensOverlapRationaleFactual Q&A256-51232-64Precise retrievalSummarization512-102464-128Broader contextCode searchFunction/class level0Natural boundariesLegal/regulatorySection/clause level1 sentencePreserve clause integrityConversational256-51264Quick, focused answersResearch/analysis1024-2048128-256Deep context"
      },
      {
        "title": "Chunk Quality Rules",
        "body": "Self-contained: A chunk should make sense on its own (add context headers if needed)\nAtomic: One main idea per chunk when possible\nRetrievable: Would this chunk be useful if a user searched for its content?\nNo orphans: Don't create chunks < 50 tokens (merge with neighbors)\nPreserve structure: Tables, code blocks, and lists should not be split mid-element\nContext prefix: Prepend document title + section hierarchy to each chunk"
      },
      {
        "title": "Parent-Child (Two-Level) Strategy",
        "body": "Parent chunks: 2048 tokens (stored for LLM context)\n  └─ Child chunks: 256 tokens (stored for retrieval)\n\nRetrieval: Search child chunks → Return parent chunk to LLM\nBenefit: Precise retrieval + rich context"
      },
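      {
        "title": "Example: Parent-Child Retrieval (Sketch)",
        "body": "A minimal sketch of the two-level lookup above. It assumes each child chunk carries a parent_id in its metadata; child_index.search is a placeholder for whatever your vector store exposes.\n\ndef parent_child_retrieve(query, child_index, parents, top_k=5):\n    # Search the small child chunks for precision...\n    hits = child_index.search(query, top_k=top_k)  # placeholder vector search\n    # ...then hand the enclosing parent chunks to the LLM, deduplicated\n    seen, results = set(), []\n    for hit in hits:\n        pid = hit.metadata[\"parent_id\"]\n        if pid not in seen:\n            seen.add(pid)\n            results.append(parents[pid])  # parent_id -> 2048-token parent chunk\n    return results"
      },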
      {
        "title": "Chunk Quality Scoring",
        "body": "Score each chunk (automated):\n\nDimensionWeight0 (Bad)5 (Good)10 (Great)Self-contained25%Sentence fragmentNeeds contextStandalone meaningfulInformation density25%Mostly boilerplateMixedDense, useful contentBoundary quality20%Mid-sentence splitParagraph boundarySection/topic boundaryMetadata completeness15%No metadataBasic fieldsFull enrichmentSize appropriateness15%<50 or >2048 tokensWithin rangeOptimal for use case\n\nTarget: Average chunk quality score > 7.0"
      },
      {
        "title": "Embedding Model Selection",
        "body": "ModelDimensionsMax TokensQualitySpeedCosttext-embedding-3-large (OpenAI)3072 (or 256-3072 via MRL)8191ExcellentFast$0.13/1M tokenstext-embedding-3-small (OpenAI)1536 (or 256-1536)8191GoodVery fast$0.02/1M tokensvoyage-3-large (Voyage)102432000ExcellentFast$0.18/1M tokensvoyage-code-3 (Voyage)102432000Best for codeFast$0.18/1M tokensCohere embed-v41024128000ExcellentFast$0.10/1M tokensBGE-M3 (open source)10248192Very goodSelf-hostFree (compute)nomic-embed-text (open source)7688192GoodSelf-hostFree (compute)GTE-Qwen2 (open source)1024-17928192ExcellentSelf-hostFree (compute)"
      },
      {
        "title": "Model Selection Rules",
        "body": "Start with: text-embedding-3-small (best cost/quality for prototypes)\nProduction default: text-embedding-3-large or voyage-3-large\nCode search: voyage-code-3 or domain-fine-tuned\nMultilingual: Cohere embed-v4 or BGE-M3\nPrivacy/on-prem: BGE-M3 or GTE-Qwen2\nBudget constrained: MRL (Matryoshka) — reduce dimensions (e.g., 3072→256) for 10x storage savings with ~5% quality loss"
      },
      {
        "title": "Embedding Best Practices",
        "body": "Prefix queries differently from documents: Some models (Nomic, E5) need task-specific prefixes\n\nDocument: \"search_document: {text}\"\nQuery: \"search_query: {text}\"\n\n\nNormalize embeddings: L2 normalize for cosine similarity\nBatch embedding: Process in batches of 100-500 for throughput\nCache embeddings: Store and reuse; don't re-embed unchanged documents\nBenchmark on YOUR data: Generic benchmarks (MTEB) don't predict domain-specific performance"
      },
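      {
        "title": "Example: Batched, Prefixed, Normalized Embedding (Sketch)",
        "body": "A sketch tying the practices above together. embed_fn stands in for your embedding client, and the search_document prefix applies only to models that expect one.\n\nimport numpy as np\n\ndef embed_corpus(texts, embed_fn, batch_size=256, prefix=\"search_document: \"):\n    # embed_fn is a placeholder: list[str] -> list[vector]\n    vectors = []\n    for i in range(0, len(texts), batch_size):  # batch for throughput\n        batch = [prefix + t for t in texts[i:i + batch_size]]\n        vectors.extend(embed_fn(batch))\n    mat = np.asarray(vectors, dtype=np.float32)\n    # L2 normalize so dot product equals cosine similarity\n    return mat / np.linalg.norm(mat, axis=1, keepdims=True)"
      },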
      {
        "title": "Embedding Quality Test",
        "body": "Before committing to a model, run this:\n\nCreate 50 query-document pairs from your actual data\nEmbed all queries and documents\nCalculate recall@5 and recall@10\nCompare 2-3 models\nPick the one with highest recall on YOUR domain\n\nTarget: recall@5 > 0.7 on your domain test set"
      },
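      {
        "title": "Example: recall@k Benchmark (Sketch)",
        "body": "A sketch of the domain benchmark above, assuming L2-normalized embeddings and gold[i] giving the index of the document that answers query i.\n\nimport numpy as np\n\ndef recall_at_k(query_vecs, doc_vecs, gold, k=5):\n    sims = query_vecs @ doc_vecs.T               # cosine similarity matrix\n    top = np.argsort(-sims, axis=1)[:, :k]       # top-k doc indices per query\n    hits = sum(gold[i] in top[i] for i in range(len(gold)))\n    return hits / len(gold)\n\n# Run per candidate model; keep the one with the highest recall@5 (target > 0.7)."
      },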
      {
        "title": "Vector Database Selection",
        "body": "DatabaseTypeScaleFeaturesBest ForPineconeManagedBillionsServerless, metadata filterProduction SaaSWeaviateManaged/Self-hostMillions-BillionsHybrid search, modulesFeature-rich appsQdrantManaged/Self-hostBillionsFiltering, quantizationHigh-performanceChromaDBEmbeddedThousands-MillionsSimple APIPrototypes, localpgvectorExtensionMillionsSQL integrationPostgres-native appsMilvusSelf-hostBillionsGPU supportLarge scaleLanceDBEmbeddedMillionsServerless, multimodalCost-sensitive"
      },
      {
        "title": "Selection Decision",
        "body": "Scale < 100K chunks AND simple use case?\n├─ YES → ChromaDB or pgvector\n└─ NO → Need managed service?\n         ├─ YES → Pinecone (simplest) or Weaviate (feature-rich)\n         └─ NO → Qdrant (performance) or Milvus (scale)"
      },
      {
        "title": "Indexing Strategy",
        "body": "Index TypeRecallSpeedMemoryUse WhenFlat/Brute100%SlowHigh<50K vectors, accuracy criticalIVF95-99%FastMedium50K-10M vectorsHNSW95-99%Very fastHighDefault choice for qualityPQ (Product Quantization)90-95%FastLowMemory constrainedHNSW+PQ93-98%FastMediumScale + quality balance\n\nDefault recommendation: HNSW with ef_construction=200, M=16"
      },
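      {
        "title": "Example: HNSW Index with Recommended Defaults (Sketch)",
        "body": "One way to apply the ef_construction=200, M=16 recommendation, here using the hnswlib library as an assumed dependency; managed stores expose the same knobs under similar names.\n\nimport hnswlib\nimport numpy as np\n\ndim = 1024\nindex = hnswlib.Index(space=\"cosine\", dim=dim)\nindex.init_index(max_elements=100_000, ef_construction=200, M=16)\n\nvectors = np.random.rand(10_000, dim).astype(np.float32)  # stand-in embeddings\nindex.add_items(vectors, np.arange(len(vectors)))\n\nindex.set_ef(100)  # query-time recall/speed knob (keep ef >= k)\nlabels, distances = index.knn_query(vectors[:1], k=5)"
      },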
      {
        "title": "Hybrid Search Architecture",
        "body": "Query → [Sparse Search (BM25)] → Top K₁ results\n      → [Dense Search (Vector)] → Top K₂ results\n      → [Reciprocal Rank Fusion] → Final Top K results → LLM\n\nWhy hybrid?\n\nDense (vector) excels at semantic similarity\nSparse (BM25/keyword) excels at exact term matching, acronyms, IDs\nHybrid captures both — 5-15% improvement over either alone\n\nRRF Formula: score = Σ 1/(k + rank_i) where k=60 (default)"
      },
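      {
        "title": "Example: Reciprocal Rank Fusion (Sketch)",
        "body": "A direct implementation sketch of the RRF formula above; each input list is ranked best-first and identified by document id.\n\ndef rrf_fuse(result_lists, k=60, top_k=10):\n    # result_lists: e.g. [bm25_ids, vector_ids], each ranked best-first\n    scores = {}\n    for results in result_lists:\n        for rank, doc_id in enumerate(results, start=1):\n            scores[doc_id] = scores.get(doc_id, 0.0) + 1.0 / (k + rank)\n    return sorted(scores, key=scores.get, reverse=True)[:top_k]\n\n# rrf_fuse([[\"a\", \"b\", \"c\"], [\"b\", \"d\", \"a\"]]) -> [\"b\", \"a\", \"d\", \"c\"]"
      },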
      {
        "title": "Metadata Filtering",
        "body": "Always support these filters:\n\nfilterable_fields:\n  - source_type: \"[document type]\"\n  - created_after: \"[date filter]\"\n  - access_level: \"[permission-based filtering]\"\n  - language: \"[language filter]\"\n  - tags: \"[topic/category filter]\"\n  - quality_score_min: \"[minimum quality threshold]\"\n\nRule: Filter BEFORE vector search, not after — reduces search space and improves relevance."
      },
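      {
        "title": "Example: Filter Before Vector Search (Sketch)",
        "body": "A sketch of the filter-first rule with plain numpy, assuming pre-normalized vectors and one metadata dict per chunk; real stores apply the same idea natively via filter expressions.\n\nimport numpy as np\n\ndef filtered_search(query_vec, doc_vecs, metadata, filters, top_k=5):\n    # Keep only chunk ids whose metadata matches every filter, BEFORE searching\n    allowed = [i for i, meta in enumerate(metadata)\n               if all(meta.get(f) == v for f, v in filters.items())]\n    if not allowed:\n        return []\n    sims = doc_vecs[allowed] @ query_vec          # cosine on the reduced space\n    order = np.argsort(-sims)[:top_k]\n    return [(allowed[i], float(sims[i])) for i in order]\n\n# filtered_search(q, vecs, metas, {\"access_level\": \"internal\", \"language\": \"en\"})"
      },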
      {
        "title": "Query Processing Pipeline",
        "body": "User Query → Query Understanding → Query Transformation → Retrieval → Reranking → Context Assembly → LLM"
      },
      {
        "title": "Query Transformation Techniques",
        "body": "TechniqueWhat It DoesWhen to UseQuality BoostQuery rewritingLLM rewrites query for clarityVague/conversational queries+10-15%HyDEGenerate hypothetical answer, embed thatFactual Q&A+5-15%Multi-queryGenerate 3-5 query variantsComplex questions+10-20%Step-backAbstract to higher-level questionComplex reasoning+5-10%Query decompositionBreak into sub-questionsMulti-part questions+15-25%Query routingRoute to different indexesMulti-source systems+10-20%"
      },
      {
        "title": "Recommended: Multi-Query + Reranking",
        "body": "# Pseudocode for production retrieval\ndef retrieve(user_query: str, top_k: int = 5) -> list[Chunk]:\n    # Step 1: Generate query variants\n    queries = generate_query_variants(user_query, n=3)  # LLM generates 3 variants\n    queries.append(user_query)  # Include original\n\n    # Step 2: Retrieve candidates from each query\n    candidates = set()\n    for q in queries:\n        results = hybrid_search(q, top_k=20)  # Over-retrieve\n        candidates.update(results)\n\n    # Step 3: Rerank\n    reranked = rerank(user_query, list(candidates), top_k=top_k)\n\n    return reranked"
      },
      {
        "title": "Reranking",
        "body": "Why rerank? Embedding similarity is a rough filter. Cross-encoder rerankers are 10-30% more accurate but too slow for initial retrieval.\n\nRerankerQualitySpeedCostCohere Rerank 3.5ExcellentFast$2/1M queriesVoyage Reranker 2ExcellentFastAPI pricingBGE-reranker-v2-m3Very goodMediumFree (self-host)ColBERT v2ExcellentMediumFree (self-host)LLM-as-rerankerBestSlowExpensive\n\nDefault: Cohere Rerank 3.5 (best quality/cost ratio)"
      },
      {
        "title": "Retrieval Parameters",
        "body": "ParameterDefaultRangeImpacttop_k (initial retrieval)2010-50Higher = better recall, more noisetop_k (after reranking)53-10Higher = more context, more costsimilarity threshold0.30.2-0.5Filter low-relevance resultsMMR diversityλ=0.70.5-1.0Lower = more diverse results"
      },
      {
        "title": "Context Assembly",
        "body": "context_assembly:\n  ordering: \"relevance_descending\"  # Most relevant first\n  deduplication: true  # Remove near-duplicate chunks\n  max_context_tokens: 4000  # Leave room for system prompt + answer\n  include_metadata: true  # Source, date, section as inline citations\n  separator: \"\\n---\\n\"  # Clear chunk boundaries\n\n  # Citation format\n  citation_style: |\n    [Source: {title} | Section: {section} | Date: {date}]\n    {chunk_text}"
      },
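      {
        "title": "Example: Context Assembly (Sketch)",
        "body": "A sketch of the assembly config above: relevance-ordered chunks, exact-match dedup (swap in fuzzier dedup as needed), a word-count token proxy, and the inline citation prefix. The chunk attributes are assumed field names.\n\ndef assemble_context(chunks, max_tokens=4000, separator=\"\\n---\\n\"):\n    # chunks: reranked best-first; each has .title, .section, .date, .text\n    blocks, used, seen = [], 0, set()\n    for c in chunks:\n        if c.text in seen:\n            continue  # exact-duplicate dedup; use MinHash/embeddings for near-dupes\n        seen.add(c.text)\n        cited = f\"[Source: {c.title} | Section: {c.section} | Date: {c.date}]\\n{c.text}\"\n        cost = len(cited.split())  # crude token proxy; use your tokenizer\n        if used + cost > max_tokens:\n            break\n        blocks.append(cited)\n        used += cost\n    return separator.join(blocks)"
      },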
      {
        "title": "System Prompt Template",
        "body": "You are a helpful assistant that answers questions based on the provided context.\n\n## Rules\n1. Answer ONLY based on the provided context. If the context doesn't contain the answer, say \"I don't have enough information to answer this question.\"\n2. Always cite your sources using [Source: X] notation.\n3. If the context contains conflicting information, acknowledge the conflict and present both perspectives.\n4. Never make up information or fill gaps with your training data.\n5. If the question is ambiguous, ask for clarification.\n6. Keep answers concise but complete.\n\n## Context\n{retrieved_context}\n\n## Conversation History (if multi-turn)\n{conversation_history}\n\n## User Question\n{user_query}"
      },
      {
        "title": "Prompt Engineering for RAG",
        "body": "Grounding rules (prevent hallucination):\n\nExplicitly instruct: \"Only use the provided context\"\nAdd: \"If you're unsure, say so rather than guessing\"\nInclude: \"Quote relevant passages to support your answer\"\nTest: Ask questions NOT in the context — model should decline\n\nCitation instructions:\n\nInline: \"Based on [Document Title, Section X]...\"\nFootnote: \"...the process involves three steps.[1]\"\nBoth: Use inline for key claims, footnotes for supporting details"
      },
      {
        "title": "Model Selection for Generation",
        "body": "ModelContext WindowQualityCostBest ForGPT-4o128KExcellentMediumGeneral productionGPT-4o-mini128KGoodLowHigh-volume, cost-sensitiveClaude Sonnet200KExcellentMediumNuanced answers, long contextClaude Haiku200KGoodLowFast, cost-sensitiveGemini 1.5 Pro1MExcellentMediumVery large context needsLlama 3.1 70B128KVery goodSelf-hostPrivacy, on-prem"
      },
      {
        "title": "Multi-Turn Conversation",
        "body": "conversation_strategy:\n  # How to handle follow-up questions\n  query_reformulation: true  # Rewrite follow-ups as standalone queries\n  context_carry_forward: \"last_2_turns\"  # How much history to include\n  memory:\n    type: \"sliding_window\"  # or \"summary\" for long conversations\n    window_size: 5  # Number of turns to keep\n\n  # Example reformulation\n  # Turn 1: \"What is RAG?\" → search as-is\n  # Turn 2: \"How does it handle updates?\" → reformulate: \"How does RAG handle document updates?\""
      },
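      {
        "title": "Example: Follow-Up Reformulation (Sketch)",
        "body": "A sketch of the sliding-window reformulation above; call_llm is a hypothetical stand-in for your model client, and history is a list of {role, text} turns.\n\ndef reformulate(question, history, call_llm, window_size=2):\n    recent = history[-window_size:]  # sliding window of prior turns\n    transcript = \"\\n\".join(f\"{t['role']}: {t['text']}\" for t in recent)\n    prompt = (\"Rewrite the follow-up as a standalone search query.\\n\"\n              f\"Conversation:\\n{transcript}\\n\"\n              f\"Follow-up: {question}\\nStandalone query:\")\n    return call_llm(prompt).strip()  # call_llm is a placeholder, not a real API"
      },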
      {
        "title": "RAG Evaluation is Non-Negotiable",
        "body": "If you're not measuring, you're guessing. Every production RAG system needs automated evaluation."
      },
      {
        "title": "Evaluation Dimensions",
        "body": "DimensionWhat It MeasuresMethodRetrieval PrecisionAre retrieved chunks relevant?Human or LLM judgeRetrieval RecallAre all relevant chunks found?Gold set comparisonAnswer FaithfulnessDoes answer match context? (no hallucination)LLM-as-judgeAnswer RelevanceDoes answer address the question?LLM-as-judgeAnswer CompletenessAre all aspects of the question addressed?LLM-as-judgeCitation AccuracyAre citations correct and sufficient?Automated + humanLatencyEnd-to-end response timeInstrumentationCostPer-query costInstrumentation"
      },
      {
        "title": "Evaluation Dataset",
        "body": "Build a golden test set (minimum 100 examples):\n\neval_example:\n  query: \"What is the refund policy for enterprise customers?\"\n  expected_sources: [\"policy-doc-v3.pdf\", \"enterprise-agreement.md\"]\n  expected_answer_contains:\n    - \"30-day refund window\"\n    - \"written notice required\"\n    - \"prorated for annual plans\"\n  answer_type: \"factual\"\n  difficulty: \"easy\"  # easy / medium / hard\n\nTest set composition:\n\n40% easy (single document, direct answer)\n35% medium (multiple documents, synthesis needed)\n15% hard (requires reasoning, edge cases)\n10% unanswerable (answer NOT in corpus — must detect)"
      },
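      {
        "title": "Example: Golden Test Set Runner (Sketch)",
        "body": "A sketch of scoring the golden set above; rag_pipeline is a placeholder returning (answer, retrieved_source_ids), and the checks mirror the eval_example fields.\n\ndef run_golden_set(examples, rag_pipeline):\n    results = []\n    for ex in examples:\n        answer, sources = rag_pipeline(ex[\"query\"])  # placeholder system under test\n        results.append({\n            \"query\": ex[\"query\"],\n            \"source_hit\": any(s in sources for s in ex[\"expected_sources\"]),\n            \"answer_hit\": all(p.lower() in answer.lower()\n                              for p in ex[\"expected_answer_contains\"]),\n            \"difficulty\": ex[\"difficulty\"],\n        })\n    pass_rate = sum(r[\"source_hit\"] and r[\"answer_hit\"] for r in results) / len(results)\n    return pass_rate, results  # gate CI on pass_rate (e.g. > 0.85)"
      },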
      {
        "title": "LLM-as-Judge Prompts",
        "body": "Faithfulness (hallucination detection):\n\nGiven the context and the answer, determine if the answer is faithful to the context.\n\nContext: {context}\nQuestion: {question}\nAnswer: {answer}\n\nScore 1-5:\n1 = Contains fabricated information not in context\n2 = Mostly faithful but includes unsupported claims\n3 = Faithful with minor extrapolation\n4 = Faithful, well-supported\n5 = Perfectly faithful, every claim traceable to context\n\nScore: [1-5]\nReasoning: [explain]\n\nAnswer Relevance:\n\nDoes this answer address the user's question?\n\nQuestion: {question}\nAnswer: {answer}\n\nScore 1-5:\n1 = Completely irrelevant\n2 = Partially relevant, misses key aspects\n3 = Relevant but incomplete\n4 = Relevant and mostly complete\n5 = Perfectly addresses all aspects of the question\n\nScore: [1-5]\nReasoning: [explain]"
      },
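      {
        "title": "Example: Parsing an LLM Judge Score (Sketch)",
        "body": "A sketch of wiring the faithfulness prompt above into an automated check; call_llm is a hypothetical model client and the template string is abbreviated here.\n\nimport re\n\n# Abbreviated version of the faithfulness template above\nFAITHFULNESS_PROMPT = (\"Given the context and the answer, determine if the answer \"\n                       \"is faithful to the context.\\nContext: {context}\\n\"\n                       \"Question: {question}\\nAnswer: {answer}\\n\"\n                       \"Score 1-5, then explain.\\nScore:\")\n\ndef judge_faithfulness(context, question, answer, call_llm):\n    reply = call_llm(FAITHFULNESS_PROMPT.format(\n        context=context, question=question, answer=answer))\n    match = re.search(r\"[1-5]\", reply)  # first 1-5 digit in the reply\n    return int(match.group()) if match else None  # None -> route to human review"
      },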
      {
        "title": "Evaluation Tools",
        "body": "ToolTypeBest ForRAGASOpen sourceComprehensive RAG metricsDeepEvalOpen sourceLLM-as-judge + classic metricsArize PhoenixOpen sourceTracing + evaluationLangSmithManagedLangChain ecosystemBraintrustManagedEval + logging + monitoringCustomDIYMaximum control"
      },
      {
        "title": "Evaluation Cadence",
        "body": "FrequencyWhat to EvaluateEvery PRRun golden test set (automated CI)WeeklySample 50 production queries for human reviewMonthlyFull evaluation suite + benchmark comparisonQuarterlyRevisit golden test set, add new examples"
      },
      {
        "title": "Production Architecture",
        "body": "┌─────────────┐     ┌──────────────┐     ┌──────────────┐\n│   Client     │────▶│  API Gateway  │────▶│  RAG Service  │\n│   (App/API)  │     │  (Rate limit) │     │              │\n└─────────────┘     └──────────────┘     │  Query Proc.  │\n                                          │  Retrieval    │\n                                          │  Reranking    │\n                                          │  Generation   │\n                                          └───────┬───────┘\n                                                  │\n                    ┌──────────────┐     ┌────────▼───────┐\n                    │  Ingestion    │────▶│  Vector Store   │\n                    │  Pipeline     │     │  + Metadata     │\n                    └──────────────┘     └────────────────┘"
      },
      {
        "title": "Production Checklist",
        "body": "Pre-Launch (Mandatory):\n\nGolden test set passing (>85% on all dimensions)\n Hallucination rate < 5% on test set\n Latency P95 < target (typically 3-5s)\n Rate limiting configured\n Input validation (max query length, content filtering)\n Output filtering (PII detection, content safety)\n Error handling (vector DB down, LLM timeout, empty results)\n Fallback behavior defined (\"I don't know\" > hallucination)\n Logging and tracing enabled\n Cost monitoring and alerts set\n Load tested at 2x expected peak\n\nSecurity:\n\nNo prompt injection vectors (user input sanitized)\n Access control on documents (user sees only authorized content)\n No PII leakage across user boundaries\n API authentication required\n Rate limiting per user/API key\n Audit logging for compliance"
      },
      {
        "title": "Caching Strategy",
        "body": "caching:\n  query_cache:\n    type: \"semantic\"  # Cache semantically similar queries\n    ttl: 3600  # 1 hour\n    similarity_threshold: 0.95\n    expected_hit_rate: \"20-40%\"\n\n  embedding_cache:\n    type: \"exact\"  # Cache document embeddings\n    ttl: 86400  # 24 hours (or until document changes)\n\n  llm_response_cache:\n    type: \"exact_query_context\"\n    ttl: 1800  # 30 minutes\n    invalidate_on: \"source_document_update\""
      },
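      {
        "title": "Example: Semantic Query Cache (Sketch)",
        "body": "A sketch of the semantic query cache above with the 0.95 threshold; embed_fn is a placeholder client, and TTL/eviction are omitted for brevity.\n\nimport numpy as np\n\nclass SemanticQueryCache:\n    def __init__(self, embed_fn, threshold=0.95):\n        self.embed_fn, self.threshold = embed_fn, threshold\n        self.keys, self.values = [], []  # normalized query vectors, cached answers\n\n    def _norm(self, text):\n        v = np.asarray(self.embed_fn(text), dtype=np.float32)\n        return v / np.linalg.norm(v)\n\n    def get(self, query):\n        if not self.keys:\n            return None\n        sims = np.stack(self.keys) @ self._norm(query)\n        best = int(np.argmax(sims))\n        return self.values[best] if sims[best] >= self.threshold else None\n\n    def put(self, query, answer):\n        self.keys.append(self._norm(query))\n        self.values.append(answer)"
      },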
      {
        "title": "Scaling Considerations",
        "body": "ScaleArchitectureNotes<1K queries/daySingle instance, managed vector DBKeep it simple1K-100K/dayHorizontal scaling, cachingAdd semantic cache100K-1M/dayMicroservices, async, CDNSeparate ingestion/retrieval>1M/dayDistributed, multi-regionCustom infrastructure"
      },
      {
        "title": "Production Monitoring Dashboard",
        "body": "rag_dashboard:\n  real_time:\n    - query_volume: \"[queries/min]\"\n    - latency_p50: \"[ms]\"\n    - latency_p95: \"[ms]\"\n    - latency_p99: \"[ms]\"\n    - error_rate: \"[%]\"\n    - cache_hit_rate: \"[%]\"\n\n  quality_signals:\n    - retrieval_confidence_avg: \"[0-1 — average similarity score]\"\n    - empty_retrieval_rate: \"[% queries with no results above threshold]\"\n    - fallback_rate: \"[% queries where model says 'I don't know']\"\n    - user_feedback_positive: \"[% thumbs up]\"\n    - citation_rate: \"[% answers with citations]\"\n\n  cost:\n    - embedding_cost_daily: \"[$]\"\n    - llm_cost_daily: \"[$]\"\n    - reranker_cost_daily: \"[$]\"\n    - vector_db_cost_daily: \"[$]\"\n    - total_cost_per_query: \"[$]\"\n\n  data_health:\n    - index_freshness: \"[time since last update]\"\n    - total_chunks_indexed: \"[count]\"\n    - failed_ingestion_count: \"[count]\"\n    - avg_chunk_quality_score: \"[0-10]\""
      },
      {
        "title": "Alert Rules",
        "body": "AlertThresholdSeverityLatency P95 > 8s5 min sustainedWarningLatency P95 > 15s1 min sustainedCriticalError rate > 5%5 min sustainedCriticalEmpty retrieval > 30%1 hourWarningHallucination detectedAny flaggedWarningCost per query > 2x baseline1 hourWarningVector DB latency > 500ms5 min sustainedWarningIndex staleness > 24hIf freshness SLA is <24hWarning"
      },
      {
        "title": "Continuous Improvement Loop",
        "body": "Monitor → Identify Failure Patterns → Root Cause → Fix → Evaluate → Deploy\n\nWeekly review questions:\n\nWhat are the top 5 query types with lowest satisfaction?\nWhich documents are never retrieved? (potential indexing issues)\nWhich queries trigger \"I don't know\"? (coverage gaps)\nWhat's the hallucination trend? (improving or degrading?)\nAre costs trending up or down per query?"
      },
      {
        "title": "Agentic RAG",
        "body": "User Query → Query Planner (LLM) → [Plan: search A, then search B, compare]\n                                     ↓\n                               Tool Execution\n                               ├─ search_documents(query_A)\n                               ├─ search_documents(query_B)\n                               ├─ calculate(comparison)\n                               └─ synthesize(results)\n                                     ↓\n                               Final Answer\n\nWhen to use: Multi-step reasoning, cross-document comparison, calculation needed.\n\nImplementation:\n\nDefine tools: search_docs, lookup_entity, calculate, compare\nUse function calling with planning prompt\nLimit iterations (max 5 tool calls per query)\nTrack and log the full reasoning chain"
      },
      {
        "title": "Graph RAG",
        "body": "graph_rag:\n  when_to_use:\n    - \"Entity-heavy domains (legal, medical, organizational)\"\n    - \"Queries about relationships ('who reports to X?')\"\n    - \"Multi-hop reasoning ('what products use components from supplier Y?')\"\n\n  architecture:\n    entities: \"[Extract entities from documents]\"\n    relationships: \"[Extract entity-entity relationships]\"\n    communities: \"[Cluster entities into topic communities]\"\n    summaries: \"[Generate community summaries]\"\n\n  retrieval:\n    local_search: \"Entity-focused — find specific entities and their neighbors\"\n    global_search: \"Community-focused — synthesize across topic clusters\"\n    hybrid: \"Combine vector similarity + graph traversal\""
      },
      {
        "title": "Corrective RAG (CRAG)",
        "body": "Query → Retrieve → Evaluate Relevance → \n  ├─ CORRECT: Retrieved docs are relevant → Generate answer\n  ├─ AMBIGUOUS: Partially relevant → Refine query + re-retrieve\n  └─ INCORRECT: Not relevant → Fall back to web search or \"I don't know\""
      },
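      {
        "title": "Example: CRAG Control Flow (Sketch)",
        "body": "A sketch of the corrective loop above; retrieve, grade, refine, generate, and web_search are all placeholders for your own components, with grade returning one of correct/ambiguous/incorrect.\n\ndef corrective_rag(query, retrieve, grade, refine, generate, web_search=None):\n    docs = retrieve(query)\n    verdict = grade(query, docs)  # \"correct\" | \"ambiguous\" | \"incorrect\"\n    if verdict == \"ambiguous\":\n        docs = retrieve(refine(query))   # refine the query and re-retrieve\n        verdict = grade(query, docs)\n    if verdict == \"incorrect\":\n        if web_search is None:\n            return \"I don't have enough information to answer this question.\"\n        docs = web_search(query)         # last-resort fallback\n    return generate(query, docs)"
      },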
      {
        "title": "Self-RAG",
        "body": "Query → Retrieve → Generate + Self-Reflect →\n  ├─ \"Is retrieval needed?\" → Skip if query is simple\n  ├─ \"Are results relevant?\" → Re-retrieve if not\n  ├─ \"Is my answer supported?\" → Revise if not faithful\n  └─ \"Is my answer useful?\" → Regenerate if not"
      },
      {
        "title": "RAG + Fine-Tuning",
        "body": "ApproachWhenBenefitRAG onlyDynamic knowledge, many sourcesFlexible, no training neededFine-tuning onlyStatic knowledge, consistent formatFast inference, no retrievalRAG + Fine-tuned embeddingsDomain-specific vocabularyBetter retrieval qualityRAG + Fine-tuned generatorConsistent output format neededBetter answers + grounding"
      },
      {
        "title": "Multi-Modal RAG",
        "body": "multimodal_rag:\n  document_types:\n    images: \"Generate text descriptions via vision model; embed descriptions\"\n    tables: \"Convert to structured text; embed as markdown\"\n    charts: \"Describe in natural language; embed description\"\n    diagrams: \"Generate detailed caption; store image reference + caption\"\n\n  retrieval:\n    strategy: \"Text-first retrieval with multimodal context assembly\"\n    image_in_context: \"Include as base64 or URL reference in prompt\""
      },
      {
        "title": "Diagnostic Decision Tree",
        "body": "RAG quality is poor\n├─ Retrieved chunks are irrelevant\n│   ├─ Check: Chunking strategy → Are chunks self-contained?\n│   ├─ Check: Embedding model → Run domain benchmark test\n│   ├─ Check: Query transformation → Enable multi-query or HyDE\n│   └─ Fix: Add reranking if not present\n│\n├─ Retrieved chunks are relevant but answer is wrong\n│   ├─ Check: System prompt → Is grounding instruction clear?\n│   ├─ Check: Context window → Is relevant info getting truncated?\n│   ├─ Check: Conflicting sources → Add conflict resolution instructions\n│   └─ Fix: Upgrade generation model\n│\n├─ System says \"I don't know\" too often\n│   ├─ Check: Similarity threshold → Too high? Lower from 0.5 to 0.3\n│   ├─ Check: Corpus coverage → Missing documents?\n│   ├─ Check: top_k → Too low? Increase from 5 to 10\n│   └─ Fix: Add query expansion\n│\n├─ Hallucination / makes things up\n│   ├─ Check: System prompt → Add explicit grounding instructions\n│   ├─ Check: Temperature → Set to 0.0-0.3 for factual tasks\n│   ├─ Check: Retrieved context → Is it misleading or ambiguous?\n│   └─ Fix: Add faithfulness evaluation in post-processing\n│\n└─ Too slow\n    ├─ Check: Embedding latency → Batch? Cache?\n    ├─ Check: Vector search → Index type? Quantization?\n    ├─ Check: Reranker → Faster model or reduce candidate set\n    └─ Fix: Add caching layer (semantic query cache)"
      },
      {
        "title": "10 RAG Anti-Patterns",
        "body": "#Anti-PatternWhy It's BadFix1No rerankingVector similarity is noisyAdd cross-encoder reranker2Fixed chunk size for all docsDifferent docs need different strategiesUse document-aware chunking3No evaluationFlying blindBuild golden test set + automated eval4Ignoring metadataMissing obvious filtering opportunitiesAdd metadata enrichment + filtering5Single query embeddingMisses semantic variantsUse multi-query retrieval6No \"I don't know\"Hallucination when context insufficientAdd explicit grounding + confidence7Embedding documents without contextChunks lose meaning in isolationPrepend title/section to chunks8No freshness managementStale answers from outdated docsImplement update pipeline + TTL9Oversized contextWasted tokens, increased cost + latencyOptimize top_k, use reranking10No access controlUsers see unauthorized contentImplement document-level ACL filtering"
      },
      {
        "title": "10 Common Mistakes",
        "body": "MistakeImpactFixStarting with complex architectureWasted timeStart naive, add complexity based on eval dataNot measuring before optimizingOptimizing wrong thingEval first, then optimize worst dimensionChunking at arbitrary character countBad retrievalUse semantic or structure-aware chunkingUsing same embedding for all languagesPoor multilingual resultsUse multilingual model or per-language indexIgnoring the 20% of hard queries80% of user complaintsBuild hard query test set, optimize for tailNo conversation contextBad multi-turn experienceImplement query reformulationStuffing entire documentsWasted tokens, noiseRetrieve only relevant chunksNot handling \"no results\" gracefullyHallucinationDefine explicit fallback behaviorOver-engineering from day 1Never shipsMVP in 1 week, iterate from dataNot versioning your indexCan't rollbackVersion embeddings + index config"
      },
      {
        "title": "RAG System Health Score (0-100)",
        "body": "DimensionWeightScore 0-10Retrieval quality (precision + recall)20%___Answer faithfulness (no hallucination)20%___Answer relevance & completeness15%___Latency & performance10%___Cost efficiency10%___Evaluation coverage10%___Data freshness & quality10%___Security & access control5%___\n\nWeighted Score: ___ / 100\n\nGradeScoreActionA85-100Production-ready, continuous improvementB70-84Good foundation, address gapsC55-69Significant improvements neededD40-54Fundamental issues, review architectureF<40Rebuild needed"
      },
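      {
        "title": "Example: Computing the Health Score (Sketch)",
        "body": "A sketch of the weighted rubric above; the dimension keys are shorthand inventions for this example.\n\nWEIGHTS = {  # shorthand keys for the rubric dimensions above\n    \"retrieval\": 0.20, \"faithfulness\": 0.20, \"relevance\": 0.15,\n    \"latency\": 0.10, \"cost\": 0.10, \"eval_coverage\": 0.10,\n    \"freshness\": 0.10, \"security\": 0.05,\n}\n\ndef health_score(scores):  # scores: dimension -> 0-10\n    total = sum(WEIGHTS[d] * scores[d] for d in WEIGHTS) * 10  # scale to 0-100\n    grade = (\"A\" if total >= 85 else \"B\" if total >= 70 else\n             \"C\" if total >= 55 else \"D\" if total >= 40 else \"F\")\n    return total, grade"
      },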
      {
        "title": "Low-Volume / Small Corpus",
        "body": "Skip vector DB — use in-memory search or full-context stuffing\nFocus on chunking quality over retrieval sophistication\nSimple keyword + semantic hybrid is sufficient"
      },
      {
        "title": "High-Security / Regulated",
        "body": "On-prem vector DB + self-hosted embedding model\nDocument-level ACL enforcement at retrieval time\nAudit logging every query + response\nData residency compliance for vector storage\nConsider homomorphic encryption for embeddings"
      },
      {
        "title": "Multi-Language",
        "body": "Use multilingual embedding model (BGE-M3, Cohere embed-v4)\nConsider per-language indexes for large corpora\nQuery language detection → route to appropriate index\nCross-lingual retrieval: query in English, retrieve in any language"
      },
      {
        "title": "Real-Time / Streaming",
        "body": "Event-driven ingestion (Kafka/webhooks → chunk → embed → index)\nIncremental indexing (add/update/delete individual chunks)\nVersion management (don't serve partially indexed documents)\nConsider time-weighted scoring (recent docs ranked higher)"
      },
      {
        "title": "Very Large Corpus (>10M documents)",
        "body": "Tiered retrieval: coarse filter → fine retrieval → reranking\nHierarchical indexing (cluster → sub-cluster → document → chunk)\nAsync processing pipeline with queue management\nConsider pre-computed answers for top 1000 queries"
      },
      {
        "title": "Natural Language Commands",
        "body": "When the user says... you respond with:\n\nCommandAction\"Design a RAG system for [use case]\"Complete Phase 1 brief + architecture recommendation\"Help me chunk [document type]\"Chunking strategy recommendation + implementation\"Which embedding model should I use?\"Model comparison for their use case + benchmark plan\"My RAG results are bad\"Diagnostic decision tree walkthrough\"Evaluate my RAG system\"Evaluation framework setup + golden test set design\"Optimize retrieval\"Query transformation + reranking recommendations\"How do I handle [specific scenario]?\"Relevant pattern from advanced section\"Set up monitoring\"Dashboard YAML + alert rules for their scale\"How much will this cost?\"Cost estimation based on their scale + optimization tips\"Compare [approach A] vs [approach B]\"Decision matrix with pros/cons for their context\"I'm getting hallucinations\"Faithfulness diagnosis + grounding improvements\"Score my RAG system\"Full quality rubric assessment\n\nBuilt by AfrexAI — AI agents that compound capital and code.\nZero dependencies. Pure methodology. Works with any RAG stack."
      }
    ],
    "body": "RAG Engineering — Complete Retrieval-Augmented Generation System\n\nBuild production RAG systems that actually work. From chunking strategy to evaluation — the complete methodology.\n\nYou are an expert RAG engineer. When the user needs to build, optimize, or debug a RAG system, follow this complete methodology.\n\nPhase 1: RAG Architecture Assessment\nQuick Health Check (Existing Systems)\nSignal\tHealthy\tWarning\tCritical\nAnswer relevance\t>85% users satisfied\t60-85%\t<60%\nRetrieval precision@5\t>70% relevant chunks\t40-70%\t<40%\nHallucination rate\t<5%\t5-15%\t>15%\nLatency (P95)\t<3s\t3-8s\t>8s\nContext utilization\t>60% of retrieved used\t30-60%\t<30%\nCost per query\t<$0.05\t$0.05-0.20\t>$0.20\nRAG Project Brief\nrag_brief:\n  project: \"[name]\"\n  date: \"YYYY-MM-DD\"\n\n  # What problem are we solving?\n  use_case: \"[customer support / code search / document Q&A / research / legal / medical]\"\n  user_persona: \"[who asks questions]\"\n  query_types:\n    - factual: \"[percentage] — direct fact lookup\"\n    - analytical: \"[percentage] — synthesis across documents\"\n    - procedural: \"[percentage] — how-to, step-by-step\"\n    - comparative: \"[percentage] — compare X vs Y\"\n    - conversational: \"[percentage] — multi-turn follow-ups\"\n\n  # What data do we have?\n  corpus:\n    total_documents: \"[count]\"\n    total_size: \"[GB/TB]\"\n    document_types:\n      - type: \"[PDF/HTML/markdown/code/JSON/CSV]\"\n        count: \"[count]\"\n        avg_length: \"[pages/tokens]\"\n    update_frequency: \"[static / daily / real-time]\"\n    languages: [\"en\", \"...\"]\n    quality: \"[curated / mixed / noisy]\"\n\n  # Requirements\n  accuracy_target: \"[% — start with 85%]\"\n  latency_target: \"[ms P95]\"\n    max_cost_per_query: \"[$]\"\n  scale: \"[queries/day]\"\n  multi_turn: \"[yes/no]\"\n  citations_required: \"[yes/no]\"\n\n  # Constraints\n  deployment: \"[cloud / on-prem / hybrid]\"\n  data_sensitivity: \"[public / internal / PII / regulated]\"\n  budget: \"[$/month for infrastructure]\"\n\nRAG Architecture Decision Tree\nIs your corpus < 100 documents AND < 50 pages each?\n├─ YES → Consider full-context stuffing (no RAG needed)\n│        Use: Long-context model (Gemini 1M, Claude 200K)\n│        When: Static docs, low query volume, budget allows\n│\n└─ NO → RAG is appropriate\n         │\n         Is real-time freshness critical?\n         ├─ YES → Streaming RAG with incremental indexing\n         └─ NO → Batch-indexed RAG\n                  │\n                  Do queries need multi-step reasoning?\n                  ├─ YES → Agentic RAG (query planning + tool use)\n                  └─ NO → Standard retrieval pipeline\n                           │\n                           Single document type?\n                           ├─ YES → Single-index RAG\n                           └─ NO → Multi-index with routing\n\nArchitecture Patterns\nPattern\tUse Case\tComplexity\tQuality\nNaive RAG\tSimple Q&A, prototypes\tLow\tMedium\nAdvanced RAG\tProduction systems\tMedium\tHigh\nModular RAG\tComplex multi-source\tHigh\tHighest\nAgentic RAG\tMulti-step research\tHighest\tHighest\nGraph RAG\tEntity-heavy domains\tHigh\tHigh for relational queries\nHybrid RAG\tMixed query types\tMedium-High\tHigh\nPhase 2: Data Ingestion & Preprocessing\nDocument Processing Pipeline\nRaw Documents → Extraction → Cleaning → Enrichment → Chunking → Embedding → Indexing\n\nExtraction Strategy by Document Type\nDocument Type\tExtraction Tool\tKey Challenges\tQuality 
Tips\nPDF\tPyMuPDF, Unstructured, Docling\tTables, images, multi-column\tUse layout-aware parser; OCR for scanned\nHTML\tBeautifulSoup, Trafilatura\tBoilerplate, navigation\tExtract main content only; preserve headers\nMarkdown\tDirect parse\tMinimal\tPreserve structure; handle frontmatter\nCode\tTree-sitter, AST\tContext loss\tInclude file path + imports as metadata\nCSV/JSON\tpandas, jq\tSchema understanding\tConvert rows to natural language\nDOCX/PPTX\tpython-docx, python-pptx\tFormatting, embedded media\tExtract text + table structure\nImages\tGPT-4V, Claude Vision\tOCR accuracy\tGenerate text descriptions; store as metadata\nAudio/Video\tWhisper, Assembly\tTimestamps, speakers\tChunk by speaker turn or topic segment\nCleaning Checklist\n Remove headers/footers/page numbers (PDF artifacts)\n Normalize whitespace (collapse multiple spaces/newlines)\n Fix encoding issues (UTF-8 normalize)\n Remove boilerplate (disclaimers, repeated navigation)\n Preserve meaningful formatting (tables, lists, code blocks)\n Handle special characters and Unicode consistently\n Detect and flag low-quality documents (OCR confidence < 80%)\n Deduplicate (exact + near-duplicate detection)\nMetadata Enrichment\n\nAlways extract and store:\n\ndocument_metadata:\n  source_id: \"[unique document identifier]\"\n  source_url: \"[original URL or file path]\"\n  title: \"[document title]\"\n  author: \"[if available]\"\n  created_date: \"[ISO 8601]\"\n  modified_date: \"[ISO 8601]\"\n  document_type: \"[pdf/html/md/code/...]\"\n  language: \"[ISO 639-1]\"\n  section_hierarchy: [\"Chapter\", \"Section\", \"Subsection\"]\n  tags: [\"auto-generated\", \"topic\", \"tags\"]\n  access_level: \"[public/internal/restricted]\"\n  quality_score: \"[0-100 from cleaning pipeline]\"\n\n\nEnrichment strategies:\n\nAuto-generate summaries per document (for hybrid search)\nExtract entities (people, companies, products, dates)\nClassify by topic/category\nGenerate hypothetical questions (HyDE technique at index time)\nPhase 3: Chunking Strategy\nThe Chunking Decision Is Critical\n\nBad chunking is the #1 cause of poor RAG quality. No amount of model sophistication fixes bad chunks.\n\nChunking Method Selection\nMethod\tBest For\tChunk Quality\tImplementation\nFixed-size\tHomogeneous text, quick prototype\tMedium\tSimple\nRecursive character\tGeneral purpose, structured docs\tGood\tLangChain default\nSemantic\tVaried content, topic shifts\tHigh\tEmbedding-based\nDocument-structure\tTechnical docs, legal, academic\tHighest\tCustom per doc type\nAgentic/LLM\tHigh-value docs, complex structure\tHighest\tExpensive\nSentence-window\tDense factual content\tGood\tSentence + context\nParent-child\tHierarchical docs, manuals\tHigh\tTwo-level index\nChunking Decision Tree\nIs your content highly structured (headers, sections, numbered)?\n├─ YES → Document-structure chunking\n│        Split on: H1 > H2 > H3 > paragraph boundaries\n│        Keep: section title chain as metadata\n│\n└─ NO → Is content topically diverse within documents?\n         ├─ YES → Semantic chunking\n         │        Split when: embedding similarity drops below threshold\n         │        Typical threshold: cosine similarity < 0.75\n         │\n         └─ NO → Recursive character splitting\n                  With: chunk_size=512, overlap=64 (tokens)\n                  Separators: [\"\\n\\n\", \"\\n\", \". 
\", \" \"]\n\nChunk Size Guidelines\nUse Case\tTarget Tokens\tOverlap\tRationale\nFactual Q&A\t256-512\t32-64\tPrecise retrieval\nSummarization\t512-1024\t64-128\tBroader context\nCode search\tFunction/class level\t0\tNatural boundaries\nLegal/regulatory\tSection/clause level\t1 sentence\tPreserve clause integrity\nConversational\t256-512\t64\tQuick, focused answers\nResearch/analysis\t1024-2048\t128-256\tDeep context\nChunk Quality Rules\nSelf-contained: A chunk should make sense on its own (add context headers if needed)\nAtomic: One main idea per chunk when possible\nRetrievable: Would this chunk be useful if a user searched for its content?\nNo orphans: Don't create chunks < 50 tokens (merge with neighbors)\nPreserve structure: Tables, code blocks, and lists should not be split mid-element\nContext prefix: Prepend document title + section hierarchy to each chunk\nParent-Child (Two-Level) Strategy\nParent chunks: 2048 tokens (stored for LLM context)\n  └─ Child chunks: 256 tokens (stored for retrieval)\n\nRetrieval: Search child chunks → Return parent chunk to LLM\nBenefit: Precise retrieval + rich context\n\nChunk Quality Scoring\n\nScore each chunk (automated):\n\nDimension\tWeight\t0 (Bad)\t5 (Good)\t10 (Great)\nSelf-contained\t25%\tSentence fragment\tNeeds context\tStandalone meaningful\nInformation density\t25%\tMostly boilerplate\tMixed\tDense, useful content\nBoundary quality\t20%\tMid-sentence split\tParagraph boundary\tSection/topic boundary\nMetadata completeness\t15%\tNo metadata\tBasic fields\tFull enrichment\nSize appropriateness\t15%\t<50 or >2048 tokens\tWithin range\tOptimal for use case\n\nTarget: Average chunk quality score > 7.0\n\nPhase 4: Embedding Strategy\nEmbedding Model Selection\nModel\tDimensions\tMax Tokens\tQuality\tSpeed\tCost\ntext-embedding-3-large (OpenAI)\t3072 (or 256-3072 via MRL)\t8191\tExcellent\tFast\t$0.13/1M tokens\ntext-embedding-3-small (OpenAI)\t1536 (or 256-1536)\t8191\tGood\tVery fast\t$0.02/1M tokens\nvoyage-3-large (Voyage)\t1024\t32000\tExcellent\tFast\t$0.18/1M tokens\nvoyage-code-3 (Voyage)\t1024\t32000\tBest for code\tFast\t$0.18/1M tokens\nCohere embed-v4\t1024\t128000\tExcellent\tFast\t$0.10/1M tokens\nBGE-M3 (open source)\t1024\t8192\tVery good\tSelf-host\tFree (compute)\nnomic-embed-text (open source)\t768\t8192\tGood\tSelf-host\tFree (compute)\nGTE-Qwen2 (open source)\t1024-1792\t8192\tExcellent\tSelf-host\tFree (compute)\nModel Selection Rules\nStart with: text-embedding-3-small (best cost/quality for prototypes)\nProduction default: text-embedding-3-large or voyage-3-large\nCode search: voyage-code-3 or domain-fine-tuned\nMultilingual: Cohere embed-v4 or BGE-M3\nPrivacy/on-prem: BGE-M3 or GTE-Qwen2\nBudget constrained: MRL (Matryoshka) — reduce dimensions (e.g., 3072→256) for 10x storage savings with ~5% quality loss\nEmbedding Best Practices\nPrefix queries differently from documents: Some models (Nomic, E5) need task-specific prefixes\nDocument: \"search_document: {text}\"\nQuery: \"search_query: {text}\"\nNormalize embeddings: L2 normalize for cosine similarity\nBatch embedding: Process in batches of 100-500 for throughput\nCache embeddings: Store and reuse; don't re-embed unchanged documents\nBenchmark on YOUR data: Generic benchmarks (MTEB) don't predict domain-specific performance\nEmbedding Quality Test\n\nBefore committing to a model, run this:\n\nCreate 50 query-document pairs from your actual data\nEmbed all queries and documents\nCalculate recall@5 and recall@10\nCompare 2-3 models\nPick the one with highest recall on 
YOUR domain\n\nTarget: recall@5 > 0.7 on your domain test set\n\nPhase 5: Vector Store & Indexing\nVector Database Selection\nDatabase\tType\tScale\tFeatures\tBest For\nPinecone\tManaged\tBillions\tServerless, metadata filter\tProduction SaaS\nWeaviate\tManaged/Self-host\tMillions-Billions\tHybrid search, modules\tFeature-rich apps\nQdrant\tManaged/Self-host\tBillions\tFiltering, quantization\tHigh-performance\nChromaDB\tEmbedded\tThousands-Millions\tSimple API\tPrototypes, local\npgvector\tExtension\tMillions\tSQL integration\tPostgres-native apps\nMilvus\tSelf-host\tBillions\tGPU support\tLarge scale\nLanceDB\tEmbedded\tMillions\tServerless, multimodal\tCost-sensitive\nSelection Decision\nScale < 100K chunks AND simple use case?\n├─ YES → ChromaDB or pgvector\n└─ NO → Need managed service?\n         ├─ YES → Pinecone (simplest) or Weaviate (feature-rich)\n         └─ NO → Qdrant (performance) or Milvus (scale)\n\nIndexing Strategy\nIndex Type\tRecall\tSpeed\tMemory\tUse When\nFlat/Brute\t100%\tSlow\tHigh\t<50K vectors, accuracy critical\nIVF\t95-99%\tFast\tMedium\t50K-10M vectors\nHNSW\t95-99%\tVery fast\tHigh\tDefault choice for quality\nPQ (Product Quantization)\t90-95%\tFast\tLow\tMemory constrained\nHNSW+PQ\t93-98%\tFast\tMedium\tScale + quality balance\n\nDefault recommendation: HNSW with ef_construction=200, M=16\n\nHybrid Search Architecture\nQuery → [Sparse Search (BM25)] → Top K₁ results\n      → [Dense Search (Vector)] → Top K₂ results\n      → [Reciprocal Rank Fusion] → Final Top K results → LLM\n\n\nWhy hybrid?\n\nDense (vector) excels at semantic similarity\nSparse (BM25/keyword) excels at exact term matching, acronyms, IDs\nHybrid captures both — 5-15% improvement over either alone\n\nRRF Formula: score = Σ 1/(k + rank_i) where k=60 (default)\n\nMetadata Filtering\n\nAlways support these filters:\n\nfilterable_fields:\n  - source_type: \"[document type]\"\n  - created_after: \"[date filter]\"\n  - access_level: \"[permission-based filtering]\"\n  - language: \"[language filter]\"\n  - tags: \"[topic/category filter]\"\n  - quality_score_min: \"[minimum quality threshold]\"\n\n\nRule: Filter BEFORE vector search, not after — reduces search space and improves relevance.\n\nPhase 6: Retrieval Optimization\nQuery Processing Pipeline\nUser Query → Query Understanding → Query Transformation → Retrieval → Reranking → Context Assembly → LLM\n\nQuery Transformation Techniques\nTechnique\tWhat It Does\tWhen to Use\tQuality Boost\nQuery rewriting\tLLM rewrites query for clarity\tVague/conversational queries\t+10-15%\nHyDE\tGenerate hypothetical answer, embed that\tFactual Q&A\t+5-15%\nMulti-query\tGenerate 3-5 query variants\tComplex questions\t+10-20%\nStep-back\tAbstract to higher-level question\tComplex reasoning\t+5-10%\nQuery decomposition\tBreak into sub-questions\tMulti-part questions\t+15-25%\nQuery routing\tRoute to different indexes\tMulti-source systems\t+10-20%\nRecommended: Multi-Query + Reranking\n# Pseudocode for production retrieval\ndef retrieve(user_query: str, top_k: int = 5) -> list[Chunk]:\n    # Step 1: Generate query variants\n    queries = generate_query_variants(user_query, n=3)  # LLM generates 3 variants\n    queries.append(user_query)  # Include original\n\n    # Step 2: Retrieve candidates from each query\n    candidates = set()\n    for q in queries:\n        results = hybrid_search(q, top_k=20)  # Over-retrieve\n        candidates.update(results)\n\n    # Step 3: Rerank\n    reranked = rerank(user_query, list(candidates), top_k=top_k)\n\n    return 
reranked\n\nReranking\n\nWhy rerank? Embedding similarity is a rough filter. Cross-encoder rerankers are 10-30% more accurate but too slow for initial retrieval.\n\nReranker\tQuality\tSpeed\tCost\nCohere Rerank 3.5\tExcellent\tFast\t$2/1M queries\nVoyage Reranker 2\tExcellent\tFast\tAPI pricing\nBGE-reranker-v2-m3\tVery good\tMedium\tFree (self-host)\nColBERT v2\tExcellent\tMedium\tFree (self-host)\nLLM-as-reranker\tBest\tSlow\tExpensive\n\nDefault: Cohere Rerank 3.5 (best quality/cost ratio)\n\nRetrieval Parameters\nParameter\tDefault\tRange\tImpact\ntop_k (initial retrieval)\t20\t10-50\tHigher = better recall, more noise\ntop_k (after reranking)\t5\t3-10\tHigher = more context, more cost\nsimilarity threshold\t0.3\t0.2-0.5\tFilter low-relevance results\nMMR diversity\tλ=0.7\t0.5-1.0\tLower = more diverse results\nContext Assembly\ncontext_assembly:\n  ordering: \"relevance_descending\"  # Most relevant first\n  deduplication: true  # Remove near-duplicate chunks\n  max_context_tokens: 4000  # Leave room for system prompt + answer\n  include_metadata: true  # Source, date, section as inline citations\n  separator: \"\\n---\\n\"  # Clear chunk boundaries\n\n  # Citation format\n  citation_style: |\n    [Source: {title} | Section: {section} | Date: {date}]\n    {chunk_text}\n\nPhase 7: Generation & Prompting\nSystem Prompt Template\nYou are a helpful assistant that answers questions based on the provided context.\n\n## Rules\n1. Answer ONLY based on the provided context. If the context doesn't contain the answer, say \"I don't have enough information to answer this question.\"\n2. Always cite your sources using [Source: X] notation.\n3. If the context contains conflicting information, acknowledge the conflict and present both perspectives.\n4. Never make up information or fill gaps with your training data.\n5. If the question is ambiguous, ask for clarification.\n6. 
\n\nPrompt Engineering for RAG\n\nGrounding rules (prevent hallucination):\n\nExplicitly instruct: \"Only use the provided context\"\nAdd: \"If you're unsure, say so rather than guessing\"\nInclude: \"Quote relevant passages to support your answer\"\nTest: Ask questions NOT in the context — the model should decline to answer\n\nCitation instructions:\n\nInline: \"Based on [Document Title, Section X]...\"\nFootnote: \"...the process involves three steps.[1]\"\nBoth: Use inline for key claims, footnotes for supporting details\nModel Selection for Generation\nModel\tContext Window\tQuality\tCost\tBest For\nGPT-4o\t128K\tExcellent\tMedium\tGeneral production\nGPT-4o-mini\t128K\tGood\tLow\tHigh-volume, cost-sensitive\nClaude Sonnet\t200K\tExcellent\tMedium\tNuanced answers, long context\nClaude Haiku\t200K\tGood\tLow\tFast, cost-sensitive\nGemini 1.5 Pro\t1M\tExcellent\tMedium\tVery large context needs\nLlama 3.1 70B\t128K\tVery good\tSelf-host\tPrivacy, on-prem\nMulti-Turn Conversation\nconversation_strategy:\n  # How to handle follow-up questions\n  query_reformulation: true  # Rewrite follow-ups as standalone queries\n  context_carry_forward: \"last_2_turns\"  # How much history to include\n  memory:\n    type: \"sliding_window\"  # or \"summary\" for long conversations\n    window_size: 5  # Number of turns to keep\n\n  # Example reformulation\n  # Turn 1: \"What is RAG?\" → search as-is\n  # Turn 2: \"How does it handle updates?\" → reformulate: \"How does RAG handle document updates?\"
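\n\nA minimal sketch of that reformulation step. llm() is a hypothetical chat-completion helper, not a specific SDK call.\n\n# Sketch: rewrite a follow-up into a standalone query before retrieval\n# llm() is an assumed chat-completion helper\ndef reformulate(history: list[tuple[str, str]], follow_up: str) -> str:\n    recent = history[-2:]  # context_carry_forward: last_2_turns\n    transcript = '\\n'.join(f'User: {q}\\nAssistant: {a}' for q, a in recent)\n    prompt = (\n        'Rewrite the final user question as a standalone search query, '\n        'resolving pronouns and references from the conversation.\\n\\n'\n        f'{transcript}\\nUser: {follow_up}\\n\\nStandalone query:'\n    )\n    return llm(prompt).strip()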
\n\nPhase 8: Evaluation Framework\nRAG Evaluation is Non-Negotiable\n\nIf you're not measuring, you're guessing. Every production RAG system needs automated evaluation.\n\nEvaluation Dimensions\nDimension\tWhat It Measures\tMethod\nRetrieval Precision\tAre retrieved chunks relevant?\tHuman or LLM judge\nRetrieval Recall\tAre all relevant chunks found?\tGold set comparison\nAnswer Faithfulness\tDoes the answer match the context? (no hallucination)\tLLM-as-judge\nAnswer Relevance\tDoes the answer address the question?\tLLM-as-judge\nAnswer Completeness\tAre all aspects of the question addressed?\tLLM-as-judge\nCitation Accuracy\tAre citations correct and sufficient?\tAutomated + human\nLatency\tEnd-to-end response time\tInstrumentation\nCost\tPer-query cost\tInstrumentation\nEvaluation Dataset\n\nBuild a golden test set (minimum 100 examples):\n\neval_example:\n  query: \"What is the refund policy for enterprise customers?\"\n  expected_sources: [\"policy-doc-v3.pdf\", \"enterprise-agreement.md\"]\n  expected_answer_contains:\n    - \"30-day refund window\"\n    - \"written notice required\"\n    - \"prorated for annual plans\"\n  answer_type: \"factual\"\n  difficulty: \"easy\"  # easy / medium / hard\n\n\nTest set composition:\n\n40% easy (single document, direct answer)\n35% medium (multiple documents, synthesis needed)\n15% hard (requires reasoning, edge cases)\n10% unanswerable (answer NOT in corpus — the system must detect this and decline)\nLLM-as-Judge Prompts\n\nFaithfulness (hallucination detection):\n\nGiven the context and the answer, determine if the answer is faithful to the context.\n\nContext: {context}\nQuestion: {question}\nAnswer: {answer}\n\nScore 1-5:\n1 = Contains fabricated information not in context\n2 = Mostly faithful but includes unsupported claims\n3 = Faithful with minor extrapolation\n4 = Faithful, well-supported\n5 = Perfectly faithful, every claim traceable to context\n\nScore: [1-5]\nReasoning: [explain]\n\n\nAnswer Relevance:\n\nDoes this answer address the user's question?\n\nQuestion: {question}\nAnswer: {answer}\n\nScore 1-5:\n1 = Completely irrelevant\n2 = Partially relevant, misses key aspects\n3 = Relevant but incomplete\n4 = Relevant and mostly complete\n5 = Perfectly addresses all aspects of the question\n\nScore: [1-5]\nReasoning: [explain]\n\nEvaluation Tools\nTool\tType\tBest For\nRAGAS\tOpen source\tComprehensive RAG metrics\nDeepEval\tOpen source\tLLM-as-judge + classic metrics\nArize Phoenix\tOpen source\tTracing + evaluation\nLangSmith\tManaged\tLangChain ecosystem\nBraintrust\tManaged\tEval + logging + monitoring\nCustom\tDIY\tMaximum control\nEvaluation Cadence\nFrequency\tWhat to Evaluate\nEvery PR\tRun golden test set (automated CI)\nWeekly\tSample 50 production queries for human review\nMonthly\tFull evaluation suite + benchmark comparison\nQuarterly\tRevisit golden test set, add new examples
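\n\nA minimal sketch of the per-PR gate: run the golden set through the pipeline and fail CI when faithfulness or source recall drops. rag_answer() and judge_faithfulness() are hypothetical wrappers around your pipeline and the faithfulness judge prompt above.\n\n# Sketch: CI gate over the golden test set\n# rag_answer() and judge_faithfulness() are assumed wrappers, not real APIs\ndef run_golden_set(examples: list[dict]) -> dict:\n    faithful = hits = 0\n    for ex in examples:\n        answer, sources = rag_answer(ex['query'])  # -> (text, source ids)\n        if judge_faithfulness(ex['query'], answer) >= 4:  # judge scores 1-5\n            faithful += 1\n        if set(ex['expected_sources']) & set(sources):\n            hits += 1\n    n = len(examples)\n    report = {'faithfulness': faithful / n, 'source_recall': hits / n}\n    assert min(report.values()) >= 0.85, report  # pre-launch bar (Phase 9)\n    return report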
\n\nPhase 9: Production Deployment\nProduction Architecture\n┌─────────────┐     ┌──────────────┐     ┌──────────────┐\n│   Client    │────▶│ API Gateway  │────▶│ RAG Service  │\n│   (App/API) │     │ (Rate limit) │     │              │\n└─────────────┘     └──────────────┘     │ Query Proc.  │\n                                         │ Retrieval    │\n                                         │ Reranking    │\n                                         │ Generation   │\n                                         └──────┬───────┘\n                                                │\n                    ┌──────────────┐     ┌──────▼───────┐\n                    │  Ingestion   │────▶│ Vector Store │\n                    │  Pipeline    │     │ + Metadata   │\n                    └──────────────┘     └──────────────┘\n\nProduction Checklist\n\nPre-Launch (Mandatory):\n\nGolden test set passing (>85% on all dimensions)\nHallucination rate < 5% on test set\nLatency P95 < target (typically 3-5s)\nRate limiting configured\nInput validation (max query length, content filtering)\nOutput filtering (PII detection, content safety)\nError handling (vector DB down, LLM timeout, empty results)\nFallback behavior defined (\"I don't know\" > hallucination)\nLogging and tracing enabled\nCost monitoring and alerts set\nLoad tested at 2x expected peak\n\nSecurity:\n\nNo prompt injection vectors (user input sanitized)\nAccess control on documents (user sees only authorized content)\nNo PII leakage across user boundaries\nAPI authentication required\nRate limiting per user/API key\nAudit logging for compliance\nCaching Strategy\ncaching:\n  query_cache:\n    type: \"semantic\"  # Cache semantically similar queries\n    ttl: 3600  # 1 hour\n    similarity_threshold: 0.95\n    expected_hit_rate: \"20-40%\"\n\n  embedding_cache:\n    type: \"exact\"  # Cache document embeddings\n    ttl: 86400  # 24 hours (or until document changes)\n\n  llm_response_cache:\n    type: \"exact_query_context\"\n    ttl: 1800  # 30 minutes\n    invalidate_on: \"source_document_update\"\n\n(A lookup sketch for the semantic query cache follows the alert rules below.)\n\nScaling Considerations\nScale\tArchitecture\tNotes\n<1K queries/day\tSingle instance, managed vector DB\tKeep it simple\n1K-100K/day\tHorizontal scaling, caching\tAdd semantic cache\n100K-1M/day\tMicroservices, async, CDN\tSeparate ingestion/retrieval\n>1M/day\tDistributed, multi-region\tCustom infrastructure\nPhase 10: Monitoring & Observability\nProduction Monitoring Dashboard\nrag_dashboard:\n  real_time:\n    - query_volume: \"[queries/min]\"\n    - latency_p50: \"[ms]\"\n    - latency_p95: \"[ms]\"\n    - latency_p99: \"[ms]\"\n    - error_rate: \"[%]\"\n    - cache_hit_rate: \"[%]\"\n\n  quality_signals:\n    - retrieval_confidence_avg: \"[0-1 — average similarity score]\"\n    - empty_retrieval_rate: \"[% queries with no results above threshold]\"\n    - fallback_rate: \"[% queries where model says 'I don't know']\"\n    - user_feedback_positive: \"[% thumbs up]\"\n    - citation_rate: \"[% answers with citations]\"\n\n  cost:\n    - embedding_cost_daily: \"[$]\"\n    - llm_cost_daily: \"[$]\"\n    - reranker_cost_daily: \"[$]\"\n    - vector_db_cost_daily: \"[$]\"\n    - total_cost_per_query: \"[$]\"\n\n  data_health:\n    - index_freshness: \"[time since last update]\"\n    - total_chunks_indexed: \"[count]\"\n    - failed_ingestion_count: \"[count]\"\n    - avg_chunk_quality_score: \"[0-10]\"\n\nAlert Rules\nAlert\tThreshold\tSeverity\nLatency P95 > 8s\t5 min sustained\tWarning\nLatency P95 > 15s\t1 min sustained\tCritical\nError rate > 5%\t5 min sustained\tCritical\nEmpty retrieval > 30%\t1 hour\tWarning\nHallucination detected\tAny flagged\tWarning\nCost per query > 2x baseline\t1 hour\tWarning\nVector DB latency > 500ms\t5 min sustained\tWarning\nIndex staleness > 24h\tIf freshness SLA is <24h\tWarning
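\n\nA minimal sketch of the semantic query cache configured above (similarity_threshold 0.95, 1-hour TTL). embed() is assumed to return a unit-normalized vector, so a dot product equals cosine similarity; the linear scan is fine at small scale, and a real deployment would query the vector store instead.\n\nimport time\n\n# Sketch: semantic query cache; embed() is an assumed helper returning\n# a unit-normalized embedding (dot product == cosine similarity)\n_cache: list[tuple[list[float], str, float]] = []  # (embedding, answer, expiry)\n\ndef cache_lookup(query: str, threshold: float = 0.95) -> str | None:\n    q, now = embed(query), time.time()\n    for vec, answer, expires in _cache:\n        if expires > now and sum(a * b for a, b in zip(q, vec)) >= threshold:\n            return answer  # semantically similar query seen recently\n    return None\n\ndef cache_store(query: str, answer: str, ttl: int = 3600) -> None:\n    _cache.append((embed(query), answer, time.time() + ttl))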
\n\nContinuous Improvement Loop\nMonitor → Identify Failure Patterns → Root Cause → Fix → Evaluate → Deploy\n\n\nWeekly review questions:\n\nWhat are the top 5 query types with the lowest satisfaction?\nWhich documents are never retrieved? (potential indexing issues)\nWhich queries trigger \"I don't know\"? (coverage gaps)\nWhat's the hallucination trend? (improving or degrading?)\nAre costs per query trending up or down?\nPhase 11: Advanced Patterns\nAgentic RAG\nUser Query → Query Planner (LLM) → [Plan: search A, then search B, compare]\n                                     ↓\n                               Tool Execution\n                               ├─ search_documents(query_A)\n                               ├─ search_documents(query_B)\n                               ├─ calculate(comparison)\n                               └─ synthesize(results)\n                                     ↓\n                               Final Answer\n\n\nWhen to use: Multi-step reasoning, cross-document comparison, calculations.\n\nImplementation:\n\nDefine tools: search_docs, lookup_entity, calculate, compare\nUse function calling with a planning prompt\nLimit iterations (max 5 tool calls per query; see the loop sketch at the end of this phase)\nTrack and log the full reasoning chain\nGraph RAG\ngraph_rag:\n  when_to_use:\n    - \"Entity-heavy domains (legal, medical, organizational)\"\n    - \"Queries about relationships ('who reports to X?')\"\n    - \"Multi-hop reasoning ('what products use components from supplier Y?')\"\n\n  architecture:\n    entities: \"[Extract entities from documents]\"\n    relationships: \"[Extract entity-entity relationships]\"\n    communities: \"[Cluster entities into topic communities]\"\n    summaries: \"[Generate community summaries]\"\n\n  retrieval:\n    local_search: \"Entity-focused — find specific entities and their neighbors\"\n    global_search: \"Community-focused — synthesize across topic clusters\"\n    hybrid: \"Combine vector similarity + graph traversal\"\n\nCorrective RAG (CRAG)\nQuery → Retrieve → Evaluate Relevance →\n  ├─ CORRECT: Retrieved docs are relevant → Generate answer\n  ├─ AMBIGUOUS: Partially relevant → Refine query + re-retrieve\n  └─ INCORRECT: Not relevant → Fall back to web search or \"I don't know\"\n\nSelf-RAG\nQuery → Retrieve → Generate + Self-Reflect →\n  ├─ \"Is retrieval needed?\" → Skip if query is simple\n  ├─ \"Are results relevant?\" → Re-retrieve if not\n  ├─ \"Is my answer supported?\" → Revise if not faithful\n  └─ \"Is my answer useful?\" → Regenerate if not\n\nRAG + Fine-Tuning\nApproach\tWhen\tBenefit\nRAG only\tDynamic knowledge, many sources\tFlexible, no training needed\nFine-tuning only\tStatic knowledge, consistent format\tFast inference, no retrieval\nRAG + Fine-tuned embeddings\tDomain-specific vocabulary\tBetter retrieval quality\nRAG + Fine-tuned generator\tConsistent output format needed\tBetter answers + grounding\nMulti-Modal RAG\nmultimodal_rag:\n  document_types:\n    images: \"Generate text descriptions via vision model; embed descriptions\"\n    tables: \"Convert to structured text; embed as markdown\"\n    charts: \"Describe in natural language; embed description\"\n    diagrams: \"Generate detailed caption; store image reference + caption\"\n\n  retrieval:\n    strategy: \"Text-first retrieval with multimodal context assembly\"\n    image_in_context: \"Include as base64 or URL reference in prompt\"
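\n\nA minimal sketch of the bounded tool loop from the Agentic RAG notes above. plan_next_step() stands in for a function-calling LLM, and search_docs, calculate, and synthesize stand in for your tools; all are assumptions.\n\n# Sketch: bounded agentic loop (max 5 tool calls per query)\n# plan_next_step(), search_docs, calculate, synthesize are assumed helpers\nTOOLS = {'search_docs': search_docs, 'calculate': calculate}\n\ndef agentic_answer(query: str, max_steps: int = 5) -> str:\n    trace = []  # full reasoning chain, kept for logging and audit\n    for _ in range(max_steps):\n        step = plan_next_step(query, trace)  # {'tool', 'args'} or {'answer'}\n        if 'answer' in step:\n            return step['answer']\n        result = TOOLS[step['tool']](**step['args'])\n        trace.append((step['tool'], step['args'], result))\n    return synthesize(query, trace)  # budget exhausted: use evidence so far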
\n\nPhase 12: Common Failure Modes & Fixes\nDiagnostic Decision Tree\nRAG quality is poor\n├─ Retrieved chunks are irrelevant\n│   ├─ Check: Chunking strategy → Are chunks self-contained?\n│   ├─ Check: Embedding model → Run domain benchmark test\n│   ├─ Check: Query transformation → Enable multi-query or HyDE\n│   └─ Fix: Add reranking if not present\n│\n├─ Retrieved chunks are relevant but answer is wrong\n│   ├─ Check: System prompt → Is grounding instruction clear?\n│   ├─ Check: Context window → Is relevant info getting truncated?\n│   ├─ Check: Conflicting sources → Add conflict resolution instructions\n│   └─ Fix: Upgrade generation model\n│\n├─ System says \"I don't know\" too often\n│   ├─ Check: Similarity threshold → Too high? Lower from 0.5 to 0.3\n│   ├─ Check: Corpus coverage → Missing documents?\n│   ├─ Check: top_k → Too low? Increase from 5 to 10\n│   └─ Fix: Add query expansion\n│\n├─ Hallucination / makes things up\n│   ├─ Check: System prompt → Add explicit grounding instructions\n│   ├─ Check: Temperature → Set to 0.0-0.3 for factual tasks\n│   ├─ Check: Retrieved context → Is it misleading or ambiguous?\n│   └─ Fix: Add faithfulness evaluation in post-processing\n│\n└─ Too slow\n    ├─ Check: Embedding latency → Batch? Cache?\n    ├─ Check: Vector search → Index type? Quantization?\n    ├─ Check: Reranker → Faster model or reduce candidate set\n    └─ Fix: Add caching layer (semantic query cache)\n\n10 RAG Anti-Patterns\n#\tAnti-Pattern\tWhy It's Bad\tFix\n1\tNo reranking\tVector similarity is noisy\tAdd cross-encoder reranker\n2\tFixed chunk size for all docs\tDifferent docs need different strategies\tUse document-aware chunking\n3\tNo evaluation\tFlying blind\tBuild golden test set + automated eval\n4\tIgnoring metadata\tMissing obvious filtering opportunities\tAdd metadata enrichment + filtering\n5\tSingle query embedding\tMisses semantic variants\tUse multi-query retrieval\n6\tNo \"I don't know\"\tHallucination when context insufficient\tAdd explicit grounding + confidence\n7\tEmbedding documents without context\tChunks lose meaning in isolation\tPrepend title/section to chunks\n8\tNo freshness management\tStale answers from outdated docs\tImplement update pipeline + TTL\n9\tOversized context\tWasted tokens, increased cost + latency\tOptimize top_k, use reranking\n10\tNo access control\tUsers see unauthorized content\tImplement document-level ACL filtering\n10 Common Mistakes\nMistake\tImpact\tFix\nStarting with complex architecture\tWasted time\tStart naive, add complexity based on eval data\nNot measuring before optimizing\tOptimizing the wrong thing\tEval first, then optimize worst dimension\nChunking at arbitrary character count\tBad retrieval\tUse semantic or structure-aware chunking\nUsing same embedding for all languages\tPoor multilingual results\tUse multilingual model or per-language index\nIgnoring the 20% of hard queries\t80% of user complaints\tBuild hard query test set, optimize for tail\nNo conversation context\tBad multi-turn experience\tImplement query reformulation\nStuffing entire documents\tWasted tokens, noise\tRetrieve only relevant chunks\nNot handling \"no results\" gracefully\tHallucination\tDefine explicit fallback behavior\nOver-engineering from day 1\tNever ships\tMVP in 1 week, iterate from data\nNot versioning your index\tCan't roll back\tVersion embeddings + index config
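\n\nThe explicit fallback (anti-pattern 6, and the \"no results\" mistake above) is cheap to enforce before generation. A minimal sketch, assuming each retrieval result is a dict carrying a 'score' similarity in [0, 1]; hybrid_search, rerank, and generate_answer are stand-ins for your pipeline.\n\nFALLBACK = \"I don't have enough information to answer this question.\"\n\n# Sketch: decline when retrieval confidence is low, instead of generating\n# Assumes results are dicts with a 'score' field (an assumption)\ndef answer_or_fallback(query: str, threshold: float = 0.3) -> str:\n    results = hybrid_search(query, top_k=20)\n    kept = [r for r in results if r['score'] >= threshold]\n    if not kept:\n        return FALLBACK  # an explicit decline beats a hallucination\n    return generate_answer(query, rerank(query, kept, top_k=5))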
\n\nQuality Scoring Rubric\nRAG System Health Score (0-100)\nDimension\tWeight\tScore 0-10\nRetrieval quality (precision + recall)\t20%\t___\nAnswer faithfulness (no hallucination)\t20%\t___\nAnswer relevance & completeness\t15%\t___\nLatency & performance\t10%\t___\nCost efficiency\t10%\t___\nEvaluation coverage\t10%\t___\nData freshness & quality\t10%\t___\nSecurity & access control\t5%\t___\n\nWeighted Score: ___ / 100\n\nGrade\tScore\tAction\nA\t85-100\tProduction-ready, continuous improvement\nB\t70-84\tGood foundation, address gaps\nC\t55-69\tSignificant improvements needed\nD\t40-54\tFundamental issues, review architecture\nF\t<40\tRebuild needed\nEdge Cases\nLow-Volume / Small Corpus\nSkip the vector DB — use in-memory search or full-context stuffing\nFocus on chunking quality over retrieval sophistication\nA simple keyword + semantic hybrid is sufficient\nHigh-Security / Regulated\nOn-prem vector DB + self-hosted embedding model\nDocument-level ACL enforcement at retrieval time\nAudit logging for every query + response\nData residency compliance for vector storage\nConsider homomorphic encryption for embeddings\nMulti-Language\nUse a multilingual embedding model (BGE-M3, Cohere embed-v4)\nConsider per-language indexes for large corpora\nQuery language detection → route to the appropriate index\nCross-lingual retrieval: query in English, retrieve in any language\nReal-Time / Streaming\nEvent-driven ingestion (Kafka/webhooks → chunk → embed → index)\nIncremental indexing (add/update/delete individual chunks)\nVersion management (don't serve partially indexed documents)\nConsider time-weighted scoring (recent docs ranked higher)\nVery Large Corpus (>10M documents)\nTiered retrieval: coarse filter → fine retrieval → reranking\nHierarchical indexing (cluster → sub-cluster → document → chunk)\nAsync processing pipeline with queue management\nConsider pre-computed answers for the top 1000 queries\nNatural Language Commands\n\nWhen the user says... you respond with:\n\nCommand\tAction\n\"Design a RAG system for [use case]\"\tComplete Phase 1 brief + architecture recommendation\n\"Help me chunk [document type]\"\tChunking strategy recommendation + implementation\n\"Which embedding model should I use?\"\tModel comparison for their use case + benchmark plan\n\"My RAG results are bad\"\tDiagnostic decision tree walkthrough\n\"Evaluate my RAG system\"\tEvaluation framework setup + golden test set design\n\"Optimize retrieval\"\tQuery transformation + reranking recommendations\n\"How do I handle [specific scenario]?\"\tRelevant pattern from advanced section\n\"Set up monitoring\"\tDashboard YAML + alert rules for their scale\n\"How much will this cost?\"\tCost estimation based on their scale + optimization tips\n\"Compare [approach A] vs [approach B]\"\tDecision matrix with pros/cons for their context\n\"I'm getting hallucinations\"\tFaithfulness diagnosis + grounding improvements\n\"Score my RAG system\"\tFull quality rubric assessment\n\nBuilt by AfrexAI — AI agents that compound capital and code. Zero dependencies. Pure methodology. Works with any RAG stack."
  },
  "trust": {
    "sourceLabel": "tencent",
    "provenanceUrl": "https://clawhub.ai/1kalin/afrexai-rag-engineering",
    "publisherUrl": "https://clawhub.ai/1kalin/afrexai-rag-engineering",
    "owner": "1kalin",
    "version": "1.0.0",
    "license": null,
    "verificationStatus": "Indexed source record"
  },
  "links": {
    "detailUrl": "https://openagent3.xyz/skills/afrexai-rag-engineering",
    "downloadUrl": "https://openagent3.xyz/downloads/afrexai-rag-engineering",
    "agentUrl": "https://openagent3.xyz/skills/afrexai-rag-engineering/agent",
    "manifestUrl": "https://openagent3.xyz/skills/afrexai-rag-engineering/agent.json",
    "briefUrl": "https://openagent3.xyz/skills/afrexai-rag-engineering/agent.md"
  }
}