{
  "schemaVersion": "1.0",
  "item": {
    "slug": "afrexai-ml-engineering",
    "name": "ML Engineering",
    "source": "tencent",
    "type": "skill",
    "category": "AI 智能",
    "sourceUrl": "https://clawhub.ai/1kalin/afrexai-ml-engineering",
    "canonicalUrl": "https://clawhub.ai/1kalin/afrexai-ml-engineering",
    "targetPlatform": "OpenClaw"
  },
  "install": {
    "downloadMode": "redirect",
    "downloadUrl": "/downloads/afrexai-ml-engineering",
    "sourceDownloadUrl": "https://wry-manatee-359.convex.site/api/v1/download?slug=afrexai-ml-engineering",
    "sourcePlatform": "tencent",
    "targetPlatform": "OpenClaw",
    "installMethod": "Manual import",
    "extraction": "Extract archive",
    "prerequisites": [
      "OpenClaw"
    ],
    "packageFormat": "ZIP package",
    "includedAssets": [
      "README.md",
      "SKILL.md"
    ],
    "primaryDoc": "SKILL.md",
    "quickSetup": [
      "Download the package from Yavira.",
      "Extract the archive and review SKILL.md first.",
      "Import or place the package into your OpenClaw setup."
    ],
    "agentAssist": {
      "summary": "Hand the extracted package to your coding agent with a concrete install brief instead of figuring it out manually.",
      "steps": [
        "Download the package from Yavira.",
        "Extract it into a folder your agent can access.",
        "Paste one of the prompts below and point your agent at the extracted folder."
      ],
      "prompts": [
        {
          "label": "New install",
          "body": "I downloaded a skill package from Yavira. Read SKILL.md from the extracted folder and install it by following the included instructions. Then review README.md for any prerequisites, environment setup, or post-install checks. Tell me what you changed and call out any manual steps you could not complete."
        },
        {
          "label": "Upgrade existing",
          "body": "I downloaded an updated skill package from Yavira. Read SKILL.md from the extracted folder, compare it with my current installation, and upgrade it while preserving any custom configuration unless the package docs explicitly say otherwise. Then review README.md for any prerequisites, environment setup, or post-install checks. Summarize what changed and any follow-up checks I should run."
        }
      ]
    },
    "sourceHealth": {
      "source": "tencent",
      "status": "healthy",
      "reason": "direct_download_ok",
      "recommendedAction": "download",
      "checkedAt": "2026-04-30T16:55:25.780Z",
      "expiresAt": "2026-05-07T16:55:25.780Z",
      "httpStatus": 200,
      "finalUrl": "https://wry-manatee-359.convex.site/api/v1/download?slug=network",
      "contentType": "application/zip",
      "probeMethod": "head",
      "details": {
        "probeUrl": "https://wry-manatee-359.convex.site/api/v1/download?slug=network",
        "contentDisposition": "attachment; filename=\"network-1.0.0.zip\"",
        "redirectLocation": null,
        "bodySnippet": null
      },
      "scope": "source",
      "summary": "Source download looks usable.",
      "detail": "Yavira can redirect you to the upstream package for this source.",
      "primaryActionLabel": "Download for OpenClaw",
      "primaryActionHref": "/downloads/afrexai-ml-engineering"
    },
    "validation": {
      "installChecklist": [
        "Use the Yavira download entry.",
        "Review SKILL.md after the package is downloaded.",
        "Confirm the extracted package contains the expected setup assets."
      ],
      "postInstallChecks": [
        "Confirm the extracted package includes the expected docs or setup files.",
        "Validate the skill or prompts are available in your target agent workspace.",
        "Capture any manual follow-up steps the agent could not complete."
      ]
    },
    "downloadPageUrl": "https://openagent3.xyz/downloads/afrexai-ml-engineering",
    "agentPageUrl": "https://openagent3.xyz/skills/afrexai-ml-engineering/agent",
    "manifestUrl": "https://openagent3.xyz/skills/afrexai-ml-engineering/agent.json",
    "briefUrl": "https://openagent3.xyz/skills/afrexai-ml-engineering/agent.md"
  },
  "agentAssist": {
    "summary": "Hand the extracted package to your coding agent with a concrete install brief instead of figuring it out manually.",
    "steps": [
      "Download the package from Yavira.",
      "Extract it into a folder your agent can access.",
      "Paste one of the prompts below and point your agent at the extracted folder."
    ],
    "prompts": [
      {
        "label": "New install",
        "body": "I downloaded a skill package from Yavira. Read SKILL.md from the extracted folder and install it by following the included instructions. Then review README.md for any prerequisites, environment setup, or post-install checks. Tell me what you changed and call out any manual steps you could not complete."
      },
      {
        "label": "Upgrade existing",
        "body": "I downloaded an updated skill package from Yavira. Read SKILL.md from the extracted folder, compare it with my current installation, and upgrade it while preserving any custom configuration unless the package docs explicitly say otherwise. Then review README.md for any prerequisites, environment setup, or post-install checks. Summarize what changed and any follow-up checks I should run."
      }
    ]
  },
  "documentation": {
    "source": "clawhub",
    "primaryDoc": "SKILL.md",
    "sections": [
      {
        "title": "ML & AI Engineering System",
        "body": "Complete methodology for building, deploying, and operating production ML/AI systems — from experiment to scale."
      },
      {
        "title": "Phase 1: Problem Framing",
        "body": "Before writing any code, define the ML problem precisely."
      },
      {
        "title": "ML Problem Brief",
        "body": "problem_brief:\n  business_objective: \"\"          # What business metric improves?\n  success_metric: \"\"              # Quantified target (e.g., \"reduce churn 15%\")\n  baseline: \"\"                    # Current performance without ML\n  ml_task_type: \"\"                # classification | regression | ranking | generation | clustering | anomaly_detection | recommendation\n  prediction_target: \"\"           # What exactly are we predicting?\n  prediction_consumer: \"\"         # Who/what uses the prediction? (API | dashboard | email | automated action)\n  latency_requirement: \"\"         # real-time (<100ms) | near-real-time (<1s) | batch (minutes-hours)\n  data_available: \"\"              # What data exists today?\n  data_gaps: \"\"                   # What's missing?\n  ethical_considerations: \"\"      # Bias risks, fairness requirements, privacy\n  kill_criteria:                  # When to abandon the ML approach\n    - \"Baseline heuristic achieves >90% of ML performance\"\n    - \"Data quality too poor after 2 weeks of cleaning\"\n    - \"Model can't beat random by >10% on holdout set\""
      },
      {
        "title": "ML vs Rules Decision",
        "body": "SignalUse RulesUse MLLogic is explainable in <10 rules✅❌Pattern is too complex for humans❌✅Training data >1,000 labeled examples—✅Needs to adapt to new patterns❌✅Must be 100% auditable/deterministic✅❌Pattern changes faster than you can update rules❌✅\n\nRule of thumb: Start with rules/heuristics. Only add ML when rules fail to capture the pattern."
      },
      {
        "title": "Data Quality Assessment",
        "body": "Score each data source (0-5 per dimension):\n\nDimension0 (Terrible)5 (Excellent)Completeness>50% missing<1% missingAccuracyKnown errors, no validationValidated against source of truthConsistencyDifferent formats, duplicatesStandardized, deduplicatedTimelinessMonths staleReal-time or daily refreshRelevanceWeak proxy for targetDirect signal for predictionVolume<100 samples>10,000 samples per class\n\nMinimum score to proceed: 18/30. Below 18 → fix data first, don't build models."
      },
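      {
        "title": "Data Quality Scoring Sketch",
        "body": "A minimal sketch of how two of the dimensions above (completeness and volume) could be scored automatically from a pandas DataFrame. The interpolation between the 0 and 5 anchors, the DataFrame, and the target column name are assumptions, not part of the upstream scoring table.\n\nimport pandas as pd\n\ndef score_completeness(df: pd.DataFrame) -> int:\n    # Map the overall fraction of missing cells onto the 0-5 scale above\n    missing = df.isna().mean().mean()\n    if missing < 0.01:\n        return 5\n    if missing > 0.50:\n        return 0\n    return round(5 * (1 - missing / 0.50))\n\ndef score_volume(df: pd.DataFrame, target: str) -> int:\n    # The smallest class drives the score for classification data\n    n_min = df[target].value_counts().min()\n    if n_min >= 10_000:\n        return 5\n    if n_min < 100:\n        return 0\n    return min(4, max(1, int(n_min / 2_500)))\n\n# Usage (hypothetical dataset and target column):\n# df = pd.read_parquet('training_data.parquet')\n# print(score_completeness(df), score_volume(df, 'churned'))"
      },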
      {
        "title": "Feature Engineering Patterns",
        "body": "feature_types:\n  numerical:\n    - raw_value           # Use as-is if normally distributed\n    - log_transform       # Right-skewed distributions (revenue, counts)\n    - standardize         # z-score for algorithms sensitive to scale (SVM, KNN, neural nets)\n    - bin_to_categorical  # When relationship is non-linear and data is limited\n  categorical:\n    - one_hot             # <20 categories, tree-based models handle natively\n    - target_encoding     # High-cardinality (>20 categories), use with K-fold to prevent leakage\n    - embedding           # Very high-cardinality (user IDs, product IDs) with deep learning\n  temporal:\n    - lag_features        # Value at t-1, t-7, t-30\n    - rolling_statistics  # Mean, std, min, max over windows\n    - time_since_event    # Days since last purchase, hours since login\n    - cyclical_encoding   # sin/cos for hour-of-day, day-of-week, month\n  text:\n    - tfidf               # Simple, interpretable, good baseline\n    - sentence_embeddings # semantic similarity, modern NLP\n    - llm_extraction      # Use LLM to extract structured fields from unstructured text\n  interaction:\n    - ratios              # Feature A / Feature B (e.g., clicks/impressions = CTR)\n    - differences         # Feature A - Feature B (e.g., price - competitor_price)\n    - polynomial          # A * B, A^2 (use sparingly, high-cardinality features)"
      },
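      {
        "title": "Cyclical and Target Encoding Sketch",
        "body": "A minimal sketch of two patterns from the list above: sin/cos cyclical encoding for hour-of-day and out-of-fold target encoding for a high-cardinality categorical. Column names and the fold count are assumptions; only pandas, numpy, and scikit-learn are used.\n\nimport numpy as np\nimport pandas as pd\nfrom sklearn.model_selection import KFold\n\ndef encode_hour_cyclical(df: pd.DataFrame, col: str = 'hour') -> pd.DataFrame:\n    # Map 0-23 onto a circle so hour 23 and hour 0 end up close together\n    df[f'{col}_sin'] = np.sin(2 * np.pi * df[col] / 24)\n    df[f'{col}_cos'] = np.cos(2 * np.pi * df[col] / 24)\n    return df\n\ndef kfold_target_encode(df: pd.DataFrame, cat: str, target: str, n_splits: int = 5) -> pd.Series:\n    # Each row is encoded with statistics from the OTHER folds only,\n    # which is what prevents target leakage.\n    encoded = pd.Series(index=df.index, dtype=float)\n    global_mean = df[target].mean()\n    kf = KFold(n_splits=n_splits, shuffle=True, random_state=42)\n    for train_idx, valid_idx in kf.split(df):\n        fold_means = df.iloc[train_idx].groupby(cat)[target].mean()\n        encoded.iloc[valid_idx] = (\n            df.iloc[valid_idx][cat].map(fold_means).fillna(global_mean).to_numpy()\n        )\n    return encoded\n\n# Usage (hypothetical columns):\n# df['city_te'] = kfold_target_encode(df, cat='city', target='converted')"
      },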
      {
        "title": "Feature Store Design",
        "body": "feature_store:\n  offline_store:         # For training — batch computed, stored in data warehouse\n    storage: \"BigQuery | Snowflake | S3+Parquet\"\n    compute: \"Spark | dbt | SQL\"\n    refresh: \"daily | hourly\"\n  online_store:          # For serving — low-latency lookups\n    storage: \"Redis | DynamoDB | Feast online\"\n    latency_target: \"<10ms p99\"\n    refresh: \"streaming | near-real-time\"\n  registry:              # Feature metadata\n    naming: \"{entity}_{feature_name}_{window}_{aggregation}\"  # e.g., user_purchase_count_30d_sum\n    documentation:\n      - description\n      - data_type\n      - source_table\n      - owner\n      - created_date\n      - known_issues"
      },
      {
        "title": "Data Leakage Prevention Checklist",
        "body": "No future information in features (time-travel check)\n Train/val/test split done BEFORE feature engineering\n Target encoding uses only training fold statistics\n No features derived from the target variable\n Temporal splits for time-series (no random shuffle)\n Holdout set created BEFORE any EDA\n Duplicates removed BEFORE splitting (same entity not in train AND test)\n Normalization/scaling fit on train, applied to val/test"
      },
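      {
        "title": "Leakage-Safe Split and Scaling Sketch",
        "body": "A minimal sketch of two items from the checklist above: split before any fitting, and fit normalization on the training portion only by keeping the scaler inside a scikit-learn Pipeline. X and y are assumed to be an already-loaded feature matrix and label vector.\n\nfrom sklearn.model_selection import train_test_split\nfrom sklearn.pipeline import make_pipeline\nfrom sklearn.preprocessing import StandardScaler\nfrom sklearn.linear_model import LogisticRegression\n\n# X, y: feature matrix and labels, assumed already loaded\n# Split FIRST, before any statistics are computed on the data\nX_train, X_test, y_train, y_test = train_test_split(\n    X, y, test_size=0.2, stratify=y, random_state=42\n)\n\n# The scaler is fit only on the training data inside the pipeline;\n# the test set is merely transformed with those training statistics.\nmodel = make_pipeline(StandardScaler(), LogisticRegression(max_iter=1000))\nmodel.fit(X_train, y_train)\nprint('holdout accuracy:', model.score(X_test, y_test))"
      },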
      {
        "title": "Experiment Tracking Template",
        "body": "experiment:\n  id: \"EXP-{YYYY-MM-DD}-{NNN}\"\n  hypothesis: \"\"                 # \"Adding user tenure features will improve churn prediction AUC by >2%\"\n  dataset_version: \"\"            # Hash or version of training data\n  features_used: []              # List of feature names\n  model_type: \"\"                 # Algorithm name\n  hyperparameters: {}            # All hyperparams logged\n  training_time: \"\"              # Wall clock\n  metrics:\n    primary: {}                  # The one metric that matters\n    secondary: {}                # Supporting metrics\n  baseline_comparison: \"\"        # Delta vs baseline\n  verdict: \"promoted | archived | iterate\"\n  notes: \"\"\n  artifacts:\n    - model_path: \"\"\n    - notebook_path: \"\"\n    - confusion_matrix: \"\""
      },
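      {
        "title": "Experiment Logging Sketch (MLflow)",
        "body": "A minimal sketch of logging the fields from the template above with MLflow. The experiment name, run id, parameter values, and metric values are placeholders, and the artifact path assumes a file you have already written to disk.\n\nimport mlflow\n\nmlflow.set_experiment('churn-prediction')\n\nwith mlflow.start_run(run_name='EXP-2026-01-15-001'):\n    mlflow.set_tag('hypothesis', 'Tenure features improve AUC by >2%')\n    mlflow.log_param('model_type', 'lightgbm')\n    mlflow.log_params({'learning_rate': 0.05, 'num_leaves': 63})\n    mlflow.log_param('dataset_version', 'v2026.01.10')\n    # ... train and evaluate the model here ...\n    mlflow.log_metric('pr_auc', 0.81)          # primary metric\n    mlflow.log_metric('baseline_delta', 0.04)  # delta vs baseline\n    mlflow.log_artifact('confusion_matrix.png')\n    mlflow.set_tag('verdict', 'promoted')"
      },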
      {
        "title": "Model Selection Guide",
        "body": "TaskStart WithScale ToAvoidTabular classificationXGBoost/LightGBMNeural nets only if >100K samplesDeep learning on <10K samplesTabular regressionXGBoost/LightGBMCatBoost for high-cardinality catsLinear regression without feature engineeringImage classificationFine-tune ResNet/EfficientNetVision Transformer if >100K imagesTraining from scratchText classificationFine-tune BERT/RoBERTaLLM few-shot if labeled data scarceBag-of-words for nuanced tasksText generationGPT-4/Claude APIFine-tuned Llama/Mistral for costTraining from scratchTime seriesProphet/ARIMA baseline → LightGBMTemporal Fusion TransformerLSTM without strong reasonRecommendationCollaborative filtering baselineTwo-tower neuralComplex models on <1K usersAnomaly detectionIsolation ForestAutoencoder if high-dimensionalSupervised methods without labeled anomaliesSearch/rankingBM25 baseline → Learning to RankCross-encoder rerankingPure keyword without semantic"
      },
      {
        "title": "Hyperparameter Tuning Strategy",
        "body": "Manual first — understand 3-5 most impactful parameters\nBayesian optimization (Optuna) — 50-100 trials for production models\nGrid search — only for final fine-tuning of 2-3 parameters\nRandom search — better than grid for >4 parameters\n\nKey hyperparameters by model:\n\nModelCritical ParamsTypical RangeXGBoostlearning_rate, max_depth, n_estimators, min_child_weight0.01-0.3, 3-10, 100-1000, 1-10LightGBMlearning_rate, num_leaves, feature_fraction, min_data_in_leaf0.01-0.3, 15-255, 0.5-1.0, 5-100Neural Netlearning_rate, batch_size, hidden_dims, dropout1e-5 to 1e-2, 32-512, arch-dependent, 0.1-0.5Random Forestn_estimators, max_depth, min_samples_leaf100-1000, 5-30, 1-20"
      },
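      {
        "title": "Bayesian Tuning Sketch (Optuna)",
        "body": "A minimal sketch of the Bayesian-optimization step above using Optuna with the XGBoost ranges from the table. X and y are assumed to be an already-prepared training matrix and labels, and PR-AUC is used as the objective as an example.\n\nimport optuna\nimport xgboost as xgb\nfrom sklearn.model_selection import cross_val_score\n\n# X, y: prepared feature matrix and labels, assumed already loaded\n\ndef objective(trial):\n    params = {\n        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3, log=True),\n        'max_depth': trial.suggest_int('max_depth', 3, 10),\n        'n_estimators': trial.suggest_int('n_estimators', 100, 1000),\n        'min_child_weight': trial.suggest_int('min_child_weight', 1, 10),\n    }\n    model = xgb.XGBClassifier(**params, eval_metric='logloss')\n    # 5-fold cross-validated PR-AUC as the tuning objective\n    return cross_val_score(model, X, y, cv=5, scoring='average_precision').mean()\n\nstudy = optuna.create_study(direction='maximize')\nstudy.optimize(objective, n_trials=50)\nprint(study.best_params, study.best_value)"
      },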
      {
        "title": "Metric Selection by Task",
        "body": "TaskPrimary MetricWhen to UseWatch Out ForBinary classification (balanced)F1-scoreEqual importance of precision/recall—Binary classification (imbalanced)PR-AUCRare positive class (<5%)ROC-AUC hides poor performance on minorityMulti-classMacro F1All classes equally importantMicro F1 if class frequency = importanceRegressionMAEOutliers should not dominateRMSE penalizes large errors moreRankingNDCG@KTop-K results matter mostMAP if binary relevanceGenerationHuman eval + automatedQuality is subjectiveBLEU/ROUGE alone are insufficientAnomaly detectionPrecision@KFalse positives are expensiveRecall if missing anomalies is dangerous"
      },
      {
        "title": "Evaluation Rigor Checklist",
        "body": "Metrics computed on TRUE holdout (never seen during training OR tuning)\n Cross-validation for small datasets (<10K samples)\n Stratified splits for imbalanced classes\n Temporal split for time-dependent data\n Confidence intervals reported (bootstrap or cross-val)\n Performance broken down by important segments (geography, user cohort, etc.)\n Fairness metrics across protected groups\n Comparison against simple baseline (majority class, mean prediction, rules)\n Error analysis: examined top 50 worst predictions manually\n Calibration plot for probabilistic predictions"
      },
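      {
        "title": "Bootstrap Confidence Interval Sketch",
        "body": "A minimal sketch of the 'confidence intervals reported' item above: a bootstrap interval for PR-AUC on the holdout set. y_true and y_score are assumed to be the holdout labels and predicted probabilities; the number of resamples is a placeholder.\n\nimport numpy as np\nfrom sklearn.metrics import average_precision_score\n\ndef bootstrap_pr_auc(y_true, y_score, n_boot=1000, seed=42):\n    rng = np.random.default_rng(seed)\n    y_true = np.asarray(y_true)\n    y_score = np.asarray(y_score)\n    scores = []\n    for _ in range(n_boot):\n        idx = rng.integers(0, len(y_true), len(y_true))  # resample with replacement\n        if y_true[idx].sum() == 0:\n            continue  # skip resamples that contain no positives\n        scores.append(average_precision_score(y_true[idx], y_score[idx]))\n    lo, hi = np.percentile(scores, [2.5, 97.5])\n    return float(np.mean(scores)), (float(lo), float(hi))\n\n# mean_ap, ci95 = bootstrap_pr_auc(y_holdout, proba_holdout)"
      },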
      {
        "title": "Offline-to-Online Gap Analysis",
        "body": "Before deploying, verify these don't cause train-serving skew:\n\nCheckOfflineOnlineActionFeature computationBatch SQLReal-time APIVerify same logic, test with replayData freshnessPoint-in-time snapshotLatest valueDocument acceptable stalenessMissing valuesImputed in pipelineMay be truly missingHandle gracefully in servingFeature distributionsTraining periodCurrent periodMonitor drift post-deploy"
      },
      {
        "title": "Deployment Pattern Decision Tree",
        "body": "Is latency < 100ms required?\n├── Yes → Is model < 500MB?\n│   ├── Yes → Embedded serving (FastAPI + model in memory)\n│   └── No → Model server (Triton, TorchServe, vLLM)\n└── No → Is it a batch prediction?\n    ├── Yes → Batch pipeline (Spark, Airflow + offline inference)\n    └── No → Async queue (Celery/SQS → worker → result store)"
      },
      {
        "title": "Production Serving Checklist",
        "body": "serving_config:\n  model:\n    format: \"\"                    # ONNX | TorchScript | SavedModel | safetensors\n    version: \"\"                   # Semantic version\n    size_mb: null\n    load_time_seconds: null\n  infrastructure:\n    compute: \"\"                   # CPU | GPU (T4/A10/A100/H100)\n    instances: null               # Min/max for autoscaling\n    autoscale_metric: \"\"          # RPS | latency_p99 | GPU_utilization\n    autoscale_target: null\n  api:\n    endpoint: \"\"\n    input_schema: {}              # Pydantic model or JSON schema\n    output_schema: {}\n    timeout_ms: null\n    rate_limit: null\n  reliability:\n    health_check: \"/health\"\n    readiness_check: \"/ready\"     # Model loaded and warm\n    graceful_shutdown: true\n    circuit_breaker: true\n    fallback: \"\"                  # Rules-based fallback when model is down"
      },
      {
        "title": "Containerization Template",
        "body": "# Multi-stage build for minimal image\nFROM python:3.11-slim AS builder\nWORKDIR /app\nCOPY requirements.txt .\nRUN pip install --no-cache-dir -r requirements.txt\n\nFROM python:3.11-slim\nWORKDIR /app\nCOPY --from=builder /usr/local/lib/python3.11/site-packages /usr/local/lib/python3.11/site-packages\nCOPY --from=builder /usr/local/bin /usr/local/bin\nCOPY model/ ./model/\nCOPY src/ ./src/\n\n# Non-root user\nRUN useradd -m appuser && chown -R appuser /app\nUSER appuser\n\n# Health check\nHEALTHCHECK --interval=30s --timeout=5s CMD curl -f http://localhost:8080/health || exit 1\n\nEXPOSE 8080\nCMD [\"uvicorn\", \"src.serve:app\", \"--host\", \"0.0.0.0\", \"--port\", \"8080\"]"
      },
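      {
        "title": "Embedded Serving Sketch (FastAPI)",
        "body": "A minimal sketch of a src/serve.py that the Dockerfile's CMD above could point at, for the embedded-serving branch of the decision tree. The model path, request schema, and use of joblib are assumptions; fastapi, pydantic, and joblib would need to be in requirements.txt.\n\n# src/serve.py — load the model once at startup, keep it in memory\nimport joblib\nfrom fastapi import FastAPI\nfrom pydantic import BaseModel\n\napp = FastAPI()\nmodel = joblib.load('model/model.joblib')  # assumed artifact path\n\nclass PredictRequest(BaseModel):\n    features: list[float]\n\n@app.get('/health')\ndef health():\n    return {'status': 'ok'}\n\n@app.get('/ready')\ndef ready():\n    # Model is loaded at import time, so process-up implies warm\n    return {'status': 'ready'}\n\n@app.post('/predict')\ndef predict(req: PredictRequest):\n    proba = float(model.predict_proba([req.features])[0][1])\n    return {'probability': proba}"
      },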
      {
        "title": "A/B Testing for Models",
        "body": "ab_test:\n  name: \"\"\n  hypothesis: \"\"\n  primary_metric: \"\"              # Business metric (revenue, engagement, etc.)\n  guardrail_metrics: []           # Metrics that must NOT degrade\n  traffic_split:\n    control: 50                   # Current model\n    treatment: 50                 # New model\n  minimum_sample_size: null       # Power analysis: use statsmodels or online calculator\n  minimum_runtime_days: null      # At least 1 full business cycle (7 days min)\n  decision_criteria:\n    ship: \"Treatment > control by >X% with p<0.05 AND no guardrail regression\"\n    iterate: \"Promising signal but not significant — extend test or refine model\"\n    kill: \"No improvement after 2x minimum runtime OR guardrail breach\""
      },
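      {
        "title": "Minimum Sample Size Sketch",
        "body": "A minimal sketch of the power analysis referenced by minimum_sample_size above, sizing a two-proportion test with statsmodels. The baseline conversion rate and minimum detectable lift are placeholder values.\n\nfrom statsmodels.stats.proportion import proportion_effectsize\nfrom statsmodels.stats.power import NormalIndPower\n\nbaseline_rate = 0.12   # control conversion rate (placeholder)\nmde = 0.01             # minimum detectable absolute lift (placeholder)\n\n# Convert the two proportions into a standardized effect size,\n# then solve for the per-arm sample size at alpha=0.05 and 80% power.\neffect = proportion_effectsize(baseline_rate + mde, baseline_rate)\nn_per_arm = NormalIndPower().solve_power(\n    effect_size=effect, alpha=0.05, power=0.8, ratio=1.0, alternative='two-sided'\n)\nprint(f'{n_per_arm:.0f} users per arm')"
      },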
      {
        "title": "LLM Application Architecture",
        "body": "┌─────────────────────────────────────────────┐\n│              Application Layer               │\n│  (Prompt templates, chains, output parsers)  │\n├─────────────────────────────────────────────┤\n│              Orchestration Layer              │\n│  (Routing, fallback, retry, caching)         │\n├─────────────────────────────────────────────┤\n│              Model Layer                     │\n│  (API calls, fine-tuned models, embeddings)  │\n├─────────────────────────────────────────────┤\n│              Data Layer                      │\n│  (Vector store, context retrieval, memory)   │\n└─────────────────────────────────────────────┘"
      },
      {
        "title": "Model Selection for LLM Tasks",
        "body": "TaskBest OptionCost-Effective OptionWhen to Fine-TuneGeneral reasoningClaude Opus / GPT-4oClaude Sonnet / GPT-4o-miniNever for general reasoningClassificationFine-tuned small modelFew-shot with Sonnet>1,000 labeled examples + high volumeExtractionStructured output APIRegex + LLM fallbackConsistent format needed at scaleSummarizationClaude SonnetGPT-4o-miniDomain-specific style neededCode generationClaude SonnetCodestral / DeepSeekInternal codebase conventionsEmbeddingstext-embedding-3-largetext-embedding-3-smallDomain-specific vocab (medical, legal)"
      },
      {
        "title": "RAG System Architecture",
        "body": "rag_pipeline:\n  ingestion:\n    chunking:\n      strategy: \"semantic\"         # semantic | fixed_size | recursive\n      chunk_size: 512              # tokens (512-1024 for most use cases)\n      overlap: 50                  # tokens overlap between chunks\n      metadata_to_preserve:\n        - source_document\n        - page_number\n        - section_heading\n        - date_created\n    embedding:\n      model: \"text-embedding-3-large\"\n      dimensions: 1536             # Or 256/512 with Matryoshka for cost savings\n    vector_store: \"Pinecone | Weaviate | pgvector | Qdrant\"\n  retrieval:\n    strategy: \"hybrid\"             # dense | sparse | hybrid (recommended)\n    top_k: 10                      # Retrieve more, then rerank\n    reranking:\n      model: \"Cohere rerank | cross-encoder\"\n      top_n: 3                     # Final context chunks\n    filters: []                    # Metadata filters (date range, source, etc.)\n  generation:\n    model: \"\"\n    system_prompt: |\n      Answer based ONLY on the provided context.\n      If the context doesn't contain the answer, say \"I don't have enough information.\"\n      Cite sources using [Source: document_name, page X].\n    temperature: 0.1               # Low for factual, higher for creative\n    max_tokens: null"
      },
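      {
        "title": "Chunking With Overlap Sketch",
        "body": "A minimal sketch of the fixed_size chunking strategy from the pipeline above, using whitespace tokens as a rough stand-in for model tokens. The chunk size and overlap follow the config values; the metadata fields kept here are a subset and the source filename is a placeholder.\n\ndef chunk_text(text: str, source: str, chunk_size: int = 512, overlap: int = 50):\n    # Whitespace tokens as a rough proxy for model tokens\n    tokens = text.split()\n    step = chunk_size - overlap\n    chunks = []\n    for start in range(0, len(tokens), step):\n        window = tokens[start:start + chunk_size]\n        if not window:\n            break\n        chunks.append({\n            'text': ' '.join(window),\n            'source_document': source,\n            'chunk_index': len(chunks),\n        })\n        if start + chunk_size >= len(tokens):\n            break\n    return chunks\n\n# chunks = chunk_text(open('handbook.txt').read(), source='handbook.txt')"
      },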
      {
        "title": "RAG Quality Checklist",
        "body": "Chunking preserves semantic meaning (not cutting mid-sentence)\n Metadata enables filtering (dates, sources, categories)\n Retrieval returns relevant chunks (test with 50+ queries manually)\n Reranking improves precision (compare with/without)\n System prompt prevents hallucination (tested with adversarial queries)\n Sources are cited and verifiable\n Handles \"I don't know\" gracefully\n Latency acceptable (<3s for interactive, <30s for complex)\n Cost per query tracked and within budget"
      },
      {
        "title": "LLM Cost Optimization",
        "body": "StrategySavingsTrade-offPrompt caching50-90% on repeated prefixesRequires cache-friendly prompt designModel routing (small → large)40-70%Slightly higher latency, need router logicBatch API50%Hours of delay, batch-only workloadsShorter promptsLinear with token reductionMay reduce qualityFine-tuned small model80-95% vs large model APITraining cost + maintenanceSemantic caching50-80% for similar queriesMay return stale/wrong cached resultOutput token limitsProportionalMay truncate useful information"
      },
      {
        "title": "Monitoring Dashboard",
        "body": "monitoring:\n  model_performance:\n    metrics:\n      - name: \"primary_metric\"         # Same as offline evaluation\n        threshold: null                 # Alert if below\n        window: \"1h | 1d | 7d\"\n      - name: \"prediction_distribution\"\n        alert: \"KL divergence > 0.1 from training distribution\"\n    latency:\n      p50_ms: null\n      p95_ms: null\n      p99_ms: null\n      alert_threshold_ms: null\n    throughput:\n      requests_per_second: null\n      error_rate_threshold: 0.01       # Alert if >1% errors\n  data_drift:\n    method: \"PSI | KS-test | JS-divergence\"\n    features_to_monitor: []            # Top 10 most important features\n    check_frequency: \"hourly | daily\"\n    alert_threshold: null              # PSI > 0.2 = significant drift\n  concept_drift:\n    method: \"performance_degradation\"\n    ground_truth_delay: \"\"             # How long until we get labels?\n    proxy_metrics: []                  # Metrics available before ground truth\n    retraining_trigger: \"\"             # When to retrain"
      },
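      {
        "title": "PSI Drift Check Sketch",
        "body": "A minimal sketch of the PSI method referenced in the data_drift block above, computed between a training sample and the current serving window for one numeric feature. The bin count is a common default, the 0.2 alert threshold follows the config, and the column name is a placeholder.\n\nimport numpy as np\n\ndef population_stability_index(expected, actual, bins=10):\n    # Bin edges come from the training (expected) distribution\n    edges = np.histogram_bin_edges(expected, bins=bins)\n    e_counts, _ = np.histogram(expected, bins=edges)\n    a_counts, _ = np.histogram(actual, bins=edges)\n    e_pct = np.clip(e_counts / e_counts.sum(), 1e-6, None)  # avoid log(0)\n    a_pct = np.clip(a_counts / a_counts.sum(), 1e-6, None)\n    return float(np.sum((a_pct - e_pct) * np.log(a_pct / e_pct)))\n\n# psi = population_stability_index(train_df['tenure_days'], serving_df['tenure_days'])\n# if psi > 0.2:\n#     alert_and_consider_retraining()  # hypothetical downstream action"
      },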
      {
        "title": "Drift Response Playbook",
        "body": "Drift TypeDetectionSeverityResponseFeature drift (input distribution shifts)PSI > 0.1WarningInvestigate cause, monitor performanceFeature drift (PSI > 0.25)PSI > 0.25CriticalRetrain on recent data within 24hConcept drift (relationship changes)Performance drop >5%CriticalRetrain with new labels, review featuresLabel drift (target distribution changes)Chi-square testWarningVerify label quality, check for data issuesPrediction drift (output distribution shifts)KL divergenceWarningMay indicate upstream data issue"
      },
      {
        "title": "Automated Retraining Pipeline",
        "body": "retraining:\n  triggers:\n    - type: \"scheduled\"\n      frequency: \"weekly | monthly\"\n    - type: \"performance\"\n      condition: \"primary_metric < threshold for 24h\"\n    - type: \"drift\"\n      condition: \"PSI > 0.2 on any top-10 feature\"\n  pipeline:\n    1_data_validation:\n      - check_completeness\n      - check_distribution_shift\n      - check_label_quality\n    2_training:\n      - use_latest_N_months_data\n      - same_hyperparameters_as_production   # Unless scheduled tuning\n      - log_all_metrics\n    3_evaluation:\n      - compare_vs_production_model\n      - must_beat_production_on_primary_metric\n      - must_not_regress_on_guardrail_metrics\n      - evaluate_on_golden_test_set\n    4_deployment:\n      - canary_deployment: 5%\n      - monitor_for: \"4h minimum\"\n      - auto_rollback_if: \"error_rate > 2x baseline\"\n      - gradual_rollout: \"5% → 25% → 50% → 100%\"\n    5_notification:\n      - log_retraining_event\n      - notify_team_on_failure\n      - update_model_registry"
      },
      {
        "title": "ML Platform Components",
        "body": "ComponentPurposeToolsExperiment trackingLog runs, compare resultsMLflow, W&B, NeptuneFeature storeCentralized feature managementFeast, Tecton, HopsworksModel registryVersion, stage, approve modelsMLflow Registry, SageMakerPipeline orchestrationDAG-based ML workflowsAirflow, Prefect, Dagster, KubeflowModel servingLow-latency inferenceTriton, TorchServe, vLLM, BentoMLMonitoringDrift, performance, data qualityEvidently, Whylogs, Great ExpectationsVector storeEmbedding storage for RAGPinecone, Weaviate, pgvector, QdrantGPU managementTraining and inference computeK8s + GPU operator, RunPod, Modal"
      },
      {
        "title": "CI/CD for ML",
        "body": "ml_cicd:\n  on_code_change:\n    - lint_and_type_check\n    - unit_tests (data transforms, feature logic)\n    - integration_tests (pipeline end-to-end on sample data)\n  on_data_change:\n    - data_validation (Great Expectations / custom)\n    - feature_pipeline_run\n    - smoke_test_predictions\n  on_model_change:\n    - full_evaluation_suite\n    - bias_and_fairness_check\n    - performance_regression_test\n    - model_size_and_latency_check\n    - security_scan (model file, dependencies)\n    - staging_deployment\n    - integration_test_in_staging\n    - approval_gate (manual for major versions)\n    - canary_deployment"
      },
      {
        "title": "Model Registry Workflow",
        "body": "┌──────────────┐      ┌──────────────┐      ┌──────────────┐\n│  Development │ ───→ │   Staging    │ ───→ │  Production  │\n│              │      │              │      │              │\n│ - Experiment │      │ - Eval suite │      │ - Canary     │\n│ - Log metrics│      │ - Load test  │      │ - Monitor    │\n│ - Compare    │      │ - Approval   │      │ - Rollback   │\n└──────────────┘      └──────────────┘      └──────────────┘\n\nPromotion criteria:\n\nDev → Staging: Beats current production on offline metrics\nStaging → Production: Passes load test + integration test + human approval\nAuto-rollback: Error rate >2x OR latency >2x OR primary metric drops >5%"
      },
      {
        "title": "Bias Detection Checklist",
        "body": "Training data represents all demographic groups proportionally\n Performance metrics broken down by protected attributes\n Equal opportunity: similar true positive rates across groups\n Calibration: predicted probabilities match actual rates per group\n No proxy features for protected attributes (ZIP code → race)\n Fairness metric selected and threshold defined BEFORE training\n Disparate impact ratio >0.8 (80% rule)\n Edge cases tested: what happens with unusual inputs?"
      },
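      {
        "title": "Disparate Impact Ratio Sketch",
        "body": "A minimal sketch of the 80% rule check from the list above: the ratio of positive-prediction rates between the least- and most-favored groups. The DataFrame, group column, and prediction column are assumptions.\n\nimport pandas as pd\n\ndef disparate_impact_ratio(df: pd.DataFrame, group_col: str, pred_col: str) -> float:\n    # Positive-prediction rate per group; ratio of the lowest to the highest\n    rates = df.groupby(group_col)[pred_col].mean()\n    return float(rates.min() / rates.max())\n\n# ratio = disparate_impact_ratio(scored, group_col='gender', pred_col='approved')\n# assert ratio > 0.8, 'Fails the 80% rule: investigate before shipping'"
      },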
      {
        "title": "Model Card Template",
        "body": "model_card:\n  model_name: \"\"\n  version: \"\"\n  date: \"\"\n  owner: \"\"\n  description: \"\"\n  intended_use: \"\"\n  out_of_scope_uses: \"\"\n  training_data:\n    source: \"\"\n    size: \"\"\n    date_range: \"\"\n    known_biases: \"\"\n  evaluation:\n    metrics: {}\n    datasets: []\n    sliced_metrics: {}             # Performance by subgroup\n  limitations: []\n  ethical_considerations: []\n  maintenance:\n    retraining_schedule: \"\"\n    monitoring: \"\"\n    contact: \"\""
      },
      {
        "title": "GPU Selection Guide",
        "body": "Use CaseGPUVRAMCost/hr (cloud)Best ForFine-tune 7B modelA10G24GB~$1LoRA/QLoRA fine-tuningFine-tune 70B modelA100 80GB80GB~$4Full fine-tuning medium modelsServe 7B modelT416GB~$0.50Inference at scaleServe 70B modelA100 40GB40GB~$2Large model inferenceTrain from scratchH10080GB~$8Pre-training, large-scale training"
      },
      {
        "title": "Inference Optimization Techniques",
        "body": "TechniqueSpeedupQuality ImpactComplexityQuantization (INT8)2-3x<1% degradationLowQuantization (INT4/GPTQ)3-4x1-3% degradationMediumBatching2-10x throughputNoneLowKV-cache optimization20-40% memory savingsNoneMediumSpeculative decoding2-3x for LLMsNone (mathematically exact)HighModel distillation5-10x smaller model2-5% degradationHighONNX Runtime1.5-3xNoneLowTensorRT2-5x<1%MediumvLLM (PagedAttention)2-4x throughput for LLMsNoneLow"
      },
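      {
        "title": "Dynamic Quantization Sketch (PyTorch)",
        "body": "A minimal sketch of the INT8 quantization row above using PyTorch dynamic quantization on a small MLP. The architecture is a placeholder standing in for a trained model; speedup and quality impact should still be re-measured on the real network.\n\nimport torch\nimport torch.nn as nn\n\n# Placeholder model standing in for a trained network\nmodel = nn.Sequential(nn.Linear(128, 256), nn.ReLU(), nn.Linear(256, 2))\nmodel.eval()\n\n# Dynamic INT8 quantization of the Linear layers: weights stored as int8,\n# activations quantized on the fly at inference time\nquantized = torch.quantization.quantize_dynamic(\n    model, {nn.Linear}, dtype=torch.qint8\n)\n\nx = torch.randn(1, 128)\nwith torch.no_grad():\n    print(quantized(x))"
      },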
      {
        "title": "Cost Tracking Template",
        "body": "ml_costs:\n  training:\n    compute_cost_per_run: null\n    runs_per_month: null\n    data_storage_monthly: null\n    experiment_tracking: null\n  inference:\n    cost_per_1k_predictions: null\n    daily_volume: null\n    monthly_cost: null\n    cost_per_query_breakdown:\n      compute: null\n      model_api_calls: null\n      vector_db: null\n      data_transfer: null\n  optimization_targets:\n    cost_per_prediction: null      # Target\n    monthly_budget: null\n    cost_reduction_goal: \"\""
      },
      {
        "title": "Phase 11: ML System Quality Rubric",
        "body": "Score your ML system (0-100):\n\nDimensionWeight0-2 (Poor)3-4 (Good)5 (Excellent)Problem framing15%No clear business metricDefined success metricKill criteria + baseline + ROI estimateData quality15%Ad-hoc data, no validationAutomated quality checksFeature store + lineage + versioningExperiment rigor15%No tracking, one-off notebooksMLflow/W&B trackingReproducible pipelines + proper evaluationModel performance15%Barely beats baselineSignificant improvementCalibrated, fair, robust to edge casesDeployment10%Manual deploymentCI/CD for modelsCanary + auto-rollback + A/B testingMonitoring15%No monitoringBasic metrics dashboardDrift detection + auto-retraining + alertsDocumentation5%Nothing documentedModel card existsFull model card + runbooks + decision logCost efficiency10%No cost trackingBudget existsOptimized inference + cost-per-prediction tracking\n\nScoring:\n\n80-100: Production-grade ML system\n60-79: Good foundations, missing operational maturity\n40-59: Prototype quality, not ready for production\n<40: Science project, needs fundamental rework"
      },
      {
        "title": "Common Mistakes",
        "body": "MistakeFixOptimizing model before fixing dataData quality > model complexity. Always.Using accuracy on imbalanced dataUse PR-AUC, F1, or domain-specific metricNo baseline comparisonAlways start with simple heuristic baselineTraining on future dataTemporal splits for time-series, strict leakage checksDeploying without monitoringNo model in production without drift detectionFine-tuning when prompting worksTry few-shot prompting first — fine-tune only for scale/costGPU for everythingCPU inference is often sufficient and 10x cheaperIgnoring calibrationIf probabilities matter (risk scoring), calibrateOne-time model deploymentML is a continuous system — plan for retraining from day 1Premature scalingProve value with batch predictions before building real-time serving"
      },
      {
        "title": "Quick Commands",
        "body": "\"Frame ML problem\" → Phase 1 brief\n\"Assess data quality\" → Phase 2 scoring\n\"Select model\" → Phase 3 guide\n\"Evaluate model\" → Phase 4 checklist\n\"Deploy model\" → Phase 5 serving config\n\"Build RAG\" → Phase 6 RAG architecture\n\"Set up monitoring\" → Phase 7 dashboard\n\"Optimize costs\" → Phase 10 tracking\n\"Score ML system\" → Phase 11 rubric\n\"Detect drift\" → Phase 7 playbook\n\"A/B test model\" → Phase 5 template\n\"Create model card\" → Phase 9 template"
      }
    ],
    "body": "ML & AI Engineering System\n\nComplete methodology for building, deploying, and operating production ML/AI systems — from experiment to scale.\n\nPhase 1: Problem Framing\n\nBefore writing any code, define the ML problem precisely.\n\nML Problem Brief\nproblem_brief:\n  business_objective: \"\"          # What business metric improves?\n  success_metric: \"\"              # Quantified target (e.g., \"reduce churn 15%\")\n  baseline: \"\"                    # Current performance without ML\n  ml_task_type: \"\"                # classification | regression | ranking | generation | clustering | anomaly_detection | recommendation\n  prediction_target: \"\"           # What exactly are we predicting?\n  prediction_consumer: \"\"         # Who/what uses the prediction? (API | dashboard | email | automated action)\n  latency_requirement: \"\"         # real-time (<100ms) | near-real-time (<1s) | batch (minutes-hours)\n  data_available: \"\"              # What data exists today?\n  data_gaps: \"\"                   # What's missing?\n  ethical_considerations: \"\"      # Bias risks, fairness requirements, privacy\n  kill_criteria:                  # When to abandon the ML approach\n    - \"Baseline heuristic achieves >90% of ML performance\"\n    - \"Data quality too poor after 2 weeks of cleaning\"\n    - \"Model can't beat random by >10% on holdout set\"\n\nML vs Rules Decision\nSignal\tUse Rules\tUse ML\nLogic is explainable in <10 rules\t✅\t❌\nPattern is too complex for humans\t❌\t✅\nTraining data >1,000 labeled examples\t—\t✅\nNeeds to adapt to new patterns\t❌\t✅\nMust be 100% auditable/deterministic\t✅\t❌\nPattern changes faster than you can update rules\t❌\t✅\n\nRule of thumb: Start with rules/heuristics. Only add ML when rules fail to capture the pattern.\n\nPhase 2: Data Engineering for ML\nData Quality Assessment\n\nScore each data source (0-5 per dimension):\n\nDimension\t0 (Terrible)\t5 (Excellent)\nCompleteness\t>50% missing\t<1% missing\nAccuracy\tKnown errors, no validation\tValidated against source of truth\nConsistency\tDifferent formats, duplicates\tStandardized, deduplicated\nTimeliness\tMonths stale\tReal-time or daily refresh\nRelevance\tWeak proxy for target\tDirect signal for prediction\nVolume\t<100 samples\t>10,000 samples per class\n\nMinimum score to proceed: 18/30. 
Below 18 → fix data first, don't build models.\n\nFeature Engineering Patterns\nfeature_types:\n  numerical:\n    - raw_value           # Use as-is if normally distributed\n    - log_transform       # Right-skewed distributions (revenue, counts)\n    - standardize         # z-score for algorithms sensitive to scale (SVM, KNN, neural nets)\n    - bin_to_categorical  # When relationship is non-linear and data is limited\n  categorical:\n    - one_hot             # <20 categories, tree-based models handle natively\n    - target_encoding     # High-cardinality (>20 categories), use with K-fold to prevent leakage\n    - embedding           # Very high-cardinality (user IDs, product IDs) with deep learning\n  temporal:\n    - lag_features        # Value at t-1, t-7, t-30\n    - rolling_statistics  # Mean, std, min, max over windows\n    - time_since_event    # Days since last purchase, hours since login\n    - cyclical_encoding   # sin/cos for hour-of-day, day-of-week, month\n  text:\n    - tfidf               # Simple, interpretable, good baseline\n    - sentence_embeddings # semantic similarity, modern NLP\n    - llm_extraction      # Use LLM to extract structured fields from unstructured text\n  interaction:\n    - ratios              # Feature A / Feature B (e.g., clicks/impressions = CTR)\n    - differences         # Feature A - Feature B (e.g., price - competitor_price)\n    - polynomial          # A * B, A^2 (use sparingly, high-cardinality features)\n\nFeature Store Design\nfeature_store:\n  offline_store:         # For training — batch computed, stored in data warehouse\n    storage: \"BigQuery | Snowflake | S3+Parquet\"\n    compute: \"Spark | dbt | SQL\"\n    refresh: \"daily | hourly\"\n  online_store:          # For serving — low-latency lookups\n    storage: \"Redis | DynamoDB | Feast online\"\n    latency_target: \"<10ms p99\"\n    refresh: \"streaming | near-real-time\"\n  registry:              # Feature metadata\n    naming: \"{entity}_{feature_name}_{window}_{aggregation}\"  # e.g., user_purchase_count_30d_sum\n    documentation:\n      - description\n      - data_type\n      - source_table\n      - owner\n      - created_date\n      - known_issues\n\nData Leakage Prevention Checklist\n No future information in features (time-travel check)\n Train/val/test split done BEFORE feature engineering\n Target encoding uses only training fold statistics\n No features derived from the target variable\n Temporal splits for time-series (no random shuffle)\n Holdout set created BEFORE any EDA\n Duplicates removed BEFORE splitting (same entity not in train AND test)\n Normalization/scaling fit on train, applied to val/test\nPhase 3: Experiment Management\nExperiment Tracking Template\nexperiment:\n  id: \"EXP-{YYYY-MM-DD}-{NNN}\"\n  hypothesis: \"\"                 # \"Adding user tenure features will improve churn prediction AUC by >2%\"\n  dataset_version: \"\"            # Hash or version of training data\n  features_used: []              # List of feature names\n  model_type: \"\"                 # Algorithm name\n  hyperparameters: {}            # All hyperparams logged\n  training_time: \"\"              # Wall clock\n  metrics:\n    primary: {}                  # The one metric that matters\n    secondary: {}                # Supporting metrics\n  baseline_comparison: \"\"        # Delta vs baseline\n  verdict: \"promoted | archived | iterate\"\n  notes: \"\"\n  artifacts:\n    - model_path: \"\"\n    - notebook_path: \"\"\n    - confusion_matrix: \"\"\n\nModel Selection 
Guide\nTask\tStart With\tScale To\tAvoid\nTabular classification\tXGBoost/LightGBM\tNeural nets only if >100K samples\tDeep learning on <10K samples\nTabular regression\tXGBoost/LightGBM\tCatBoost for high-cardinality cats\tLinear regression without feature engineering\nImage classification\tFine-tune ResNet/EfficientNet\tVision Transformer if >100K images\tTraining from scratch\nText classification\tFine-tune BERT/RoBERTa\tLLM few-shot if labeled data scarce\tBag-of-words for nuanced tasks\nText generation\tGPT-4/Claude API\tFine-tuned Llama/Mistral for cost\tTraining from scratch\nTime series\tProphet/ARIMA baseline → LightGBM\tTemporal Fusion Transformer\tLSTM without strong reason\nRecommendation\tCollaborative filtering baseline\tTwo-tower neural\tComplex models on <1K users\nAnomaly detection\tIsolation Forest\tAutoencoder if high-dimensional\tSupervised methods without labeled anomalies\nSearch/ranking\tBM25 baseline → Learning to Rank\tCross-encoder reranking\tPure keyword without semantic\nHyperparameter Tuning Strategy\nManual first — understand 3-5 most impactful parameters\nBayesian optimization (Optuna) — 50-100 trials for production models\nGrid search — only for final fine-tuning of 2-3 parameters\nRandom search — better than grid for >4 parameters\n\nKey hyperparameters by model:\n\nModel\tCritical Params\tTypical Range\nXGBoost\tlearning_rate, max_depth, n_estimators, min_child_weight\t0.01-0.3, 3-10, 100-1000, 1-10\nLightGBM\tlearning_rate, num_leaves, feature_fraction, min_data_in_leaf\t0.01-0.3, 15-255, 0.5-1.0, 5-100\nNeural Net\tlearning_rate, batch_size, hidden_dims, dropout\t1e-5 to 1e-2, 32-512, arch-dependent, 0.1-0.5\nRandom Forest\tn_estimators, max_depth, min_samples_leaf\t100-1000, 5-30, 1-20\nPhase 4: Model Evaluation\nMetric Selection by Task\nTask\tPrimary Metric\tWhen to Use\tWatch Out For\nBinary classification (balanced)\tF1-score\tEqual importance of precision/recall\t—\nBinary classification (imbalanced)\tPR-AUC\tRare positive class (<5%)\tROC-AUC hides poor performance on minority\nMulti-class\tMacro F1\tAll classes equally important\tMicro F1 if class frequency = importance\nRegression\tMAE\tOutliers should not dominate\tRMSE penalizes large errors more\nRanking\tNDCG@K\tTop-K results matter most\tMAP if binary relevance\nGeneration\tHuman eval + automated\tQuality is subjective\tBLEU/ROUGE alone are insufficient\nAnomaly detection\tPrecision@K\tFalse positives are expensive\tRecall if missing anomalies is dangerous\nEvaluation Rigor Checklist\n Metrics computed on TRUE holdout (never seen during training OR tuning)\n Cross-validation for small datasets (<10K samples)\n Stratified splits for imbalanced classes\n Temporal split for time-dependent data\n Confidence intervals reported (bootstrap or cross-val)\n Performance broken down by important segments (geography, user cohort, etc.)\n Fairness metrics across protected groups\n Comparison against simple baseline (majority class, mean prediction, rules)\n Error analysis: examined top 50 worst predictions manually\n Calibration plot for probabilistic predictions\nOffline-to-Online Gap Analysis\n\nBefore deploying, verify these don't cause train-serving skew:\n\nCheck\tOffline\tOnline\tAction\nFeature computation\tBatch SQL\tReal-time API\tVerify same logic, test with replay\nData freshness\tPoint-in-time snapshot\tLatest value\tDocument acceptable staleness\nMissing values\tImputed in pipeline\tMay be truly missing\tHandle gracefully in serving\nFeature distributions\tTraining period\tCurrent 
period\tMonitor drift post-deploy\nPhase 5: Model Deployment\nDeployment Pattern Decision Tree\nIs latency < 100ms required?\n├── Yes → Is model < 500MB?\n│   ├── Yes → Embedded serving (FastAPI + model in memory)\n│   └── No → Model server (Triton, TorchServe, vLLM)\n└── No → Is it a batch prediction?\n    ├── Yes → Batch pipeline (Spark, Airflow + offline inference)\n    └── No → Async queue (Celery/SQS → worker → result store)\n\nProduction Serving Checklist\nserving_config:\n  model:\n    format: \"\"                    # ONNX | TorchScript | SavedModel | safetensors\n    version: \"\"                   # Semantic version\n    size_mb: null\n    load_time_seconds: null\n  infrastructure:\n    compute: \"\"                   # CPU | GPU (T4/A10/A100/H100)\n    instances: null               # Min/max for autoscaling\n    autoscale_metric: \"\"          # RPS | latency_p99 | GPU_utilization\n    autoscale_target: null\n  api:\n    endpoint: \"\"\n    input_schema: {}              # Pydantic model or JSON schema\n    output_schema: {}\n    timeout_ms: null\n    rate_limit: null\n  reliability:\n    health_check: \"/health\"\n    readiness_check: \"/ready\"     # Model loaded and warm\n    graceful_shutdown: true\n    circuit_breaker: true\n    fallback: \"\"                  # Rules-based fallback when model is down\n\nContainerization Template\n# Multi-stage build for minimal image\nFROM python:3.11-slim AS builder\nWORKDIR /app\nCOPY requirements.txt .\nRUN pip install --no-cache-dir -r requirements.txt\n\nFROM python:3.11-slim\nWORKDIR /app\nCOPY --from=builder /usr/local/lib/python3.11/site-packages /usr/local/lib/python3.11/site-packages\nCOPY --from=builder /usr/local/bin /usr/local/bin\nCOPY model/ ./model/\nCOPY src/ ./src/\n\n# Non-root user\nRUN useradd -m appuser && chown -R appuser /app\nUSER appuser\n\n# Health check\nHEALTHCHECK --interval=30s --timeout=5s CMD curl -f http://localhost:8080/health || exit 1\n\nEXPOSE 8080\nCMD [\"uvicorn\", \"src.serve:app\", \"--host\", \"0.0.0.0\", \"--port\", \"8080\"]\n\nA/B Testing for Models\nab_test:\n  name: \"\"\n  hypothesis: \"\"\n  primary_metric: \"\"              # Business metric (revenue, engagement, etc.)\n  guardrail_metrics: []           # Metrics that must NOT degrade\n  traffic_split:\n    control: 50                   # Current model\n    treatment: 50                 # New model\n  minimum_sample_size: null       # Power analysis: use statsmodels or online calculator\n  minimum_runtime_days: null      # At least 1 full business cycle (7 days min)\n  decision_criteria:\n    ship: \"Treatment > control by >X% with p<0.05 AND no guardrail regression\"\n    iterate: \"Promising signal but not significant — extend test or refine model\"\n    kill: \"No improvement after 2x minimum runtime OR guardrail breach\"\n\nPhase 6: LLM Engineering\nLLM Application Architecture\n┌─────────────────────────────────────────────┐\n│              Application Layer               │\n│  (Prompt templates, chains, output parsers)  │\n├─────────────────────────────────────────────┤\n│              Orchestration Layer              │\n│  (Routing, fallback, retry, caching)         │\n├─────────────────────────────────────────────┤\n│              Model Layer                     │\n│  (API calls, fine-tuned models, embeddings)  │\n├─────────────────────────────────────────────┤\n│              Data Layer                      │\n│  (Vector store, context retrieval, memory)   │\n└─────────────────────────────────────────────┘\n\nModel Selection for 
LLM Tasks\nTask\tBest Option\tCost-Effective Option\tWhen to Fine-Tune\nGeneral reasoning\tClaude Opus / GPT-4o\tClaude Sonnet / GPT-4o-mini\tNever for general reasoning\nClassification\tFine-tuned small model\tFew-shot with Sonnet\t>1,000 labeled examples + high volume\nExtraction\tStructured output API\tRegex + LLM fallback\tConsistent format needed at scale\nSummarization\tClaude Sonnet\tGPT-4o-mini\tDomain-specific style needed\nCode generation\tClaude Sonnet\tCodestral / DeepSeek\tInternal codebase conventions\nEmbeddings\ttext-embedding-3-large\ttext-embedding-3-small\tDomain-specific vocab (medical, legal)\nRAG System Architecture\nrag_pipeline:\n  ingestion:\n    chunking:\n      strategy: \"semantic\"         # semantic | fixed_size | recursive\n      chunk_size: 512              # tokens (512-1024 for most use cases)\n      overlap: 50                  # tokens overlap between chunks\n      metadata_to_preserve:\n        - source_document\n        - page_number\n        - section_heading\n        - date_created\n    embedding:\n      model: \"text-embedding-3-large\"\n      dimensions: 1536             # Or 256/512 with Matryoshka for cost savings\n    vector_store: \"Pinecone | Weaviate | pgvector | Qdrant\"\n  retrieval:\n    strategy: \"hybrid\"             # dense | sparse | hybrid (recommended)\n    top_k: 10                      # Retrieve more, then rerank\n    reranking:\n      model: \"Cohere rerank | cross-encoder\"\n      top_n: 3                     # Final context chunks\n    filters: []                    # Metadata filters (date range, source, etc.)\n  generation:\n    model: \"\"\n    system_prompt: |\n      Answer based ONLY on the provided context.\n      If the context doesn't contain the answer, say \"I don't have enough information.\"\n      Cite sources using [Source: document_name, page X].\n    temperature: 0.1               # Low for factual, higher for creative\n    max_tokens: null\n\nRAG Quality Checklist\n Chunking preserves semantic meaning (not cutting mid-sentence)\n Metadata enables filtering (dates, sources, categories)\n Retrieval returns relevant chunks (test with 50+ queries manually)\n Reranking improves precision (compare with/without)\n System prompt prevents hallucination (tested with adversarial queries)\n Sources are cited and verifiable\n Handles \"I don't know\" gracefully\n Latency acceptable (<3s for interactive, <30s for complex)\n Cost per query tracked and within budget\nLLM Cost Optimization\nStrategy\tSavings\tTrade-off\nPrompt caching\t50-90% on repeated prefixes\tRequires cache-friendly prompt design\nModel routing (small → large)\t40-70%\tSlightly higher latency, need router logic\nBatch API\t50%\tHours of delay, batch-only workloads\nShorter prompts\tLinear with token reduction\tMay reduce quality\nFine-tuned small model\t80-95% vs large model API\tTraining cost + maintenance\nSemantic caching\t50-80% for similar queries\tMay return stale/wrong cached result\nOutput token limits\tProportional\tMay truncate useful information\nPhase 7: Model Monitoring\nMonitoring Dashboard\nmonitoring:\n  model_performance:\n    metrics:\n      - name: \"primary_metric\"         # Same as offline evaluation\n        threshold: null                 # Alert if below\n        window: \"1h | 1d | 7d\"\n      - name: \"prediction_distribution\"\n        alert: \"KL divergence > 0.1 from training distribution\"\n    latency:\n      p50_ms: null\n      p95_ms: null\n      p99_ms: null\n      alert_threshold_ms: null\n    throughput:\n      
requests_per_second: null\n      error_rate_threshold: 0.01       # Alert if >1% errors\n  data_drift:\n    method: \"PSI | KS-test | JS-divergence\"\n    features_to_monitor: []            # Top 10 most important features\n    check_frequency: \"hourly | daily\"\n    alert_threshold: null              # PSI > 0.2 = significant drift\n  concept_drift:\n    method: \"performance_degradation\"\n    ground_truth_delay: \"\"             # How long until we get labels?\n    proxy_metrics: []                  # Metrics available before ground truth\n    retraining_trigger: \"\"             # When to retrain\n\nDrift Response Playbook\nDrift Type\tDetection\tSeverity\tResponse\nFeature drift (input distribution shifts)\tPSI > 0.1\tWarning\tInvestigate cause, monitor performance\nFeature drift (PSI > 0.25)\tPSI > 0.25\tCritical\tRetrain on recent data within 24h\nConcept drift (relationship changes)\tPerformance drop >5%\tCritical\tRetrain with new labels, review features\nLabel drift (target distribution changes)\tChi-square test\tWarning\tVerify label quality, check for data issues\nPrediction drift (output distribution shifts)\tKL divergence\tWarning\tMay indicate upstream data issue\nAutomated Retraining Pipeline\nretraining:\n  triggers:\n    - type: \"scheduled\"\n      frequency: \"weekly | monthly\"\n    - type: \"performance\"\n      condition: \"primary_metric < threshold for 24h\"\n    - type: \"drift\"\n      condition: \"PSI > 0.2 on any top-10 feature\"\n  pipeline:\n    1_data_validation:\n      - check_completeness\n      - check_distribution_shift\n      - check_label_quality\n    2_training:\n      - use_latest_N_months_data\n      - same_hyperparameters_as_production   # Unless scheduled tuning\n      - log_all_metrics\n    3_evaluation:\n      - compare_vs_production_model\n      - must_beat_production_on_primary_metric\n      - must_not_regress_on_guardrail_metrics\n      - evaluate_on_golden_test_set\n    4_deployment:\n      - canary_deployment: 5%\n      - monitor_for: \"4h minimum\"\n      - auto_rollback_if: \"error_rate > 2x baseline\"\n      - gradual_rollout: \"5% → 25% → 50% → 100%\"\n    5_notification:\n      - log_retraining_event\n      - notify_team_on_failure\n      - update_model_registry\n\nPhase 8: MLOps Infrastructure\nML Platform Components\nComponent\tPurpose\tTools\nExperiment tracking\tLog runs, compare results\tMLflow, W&B, Neptune\nFeature store\tCentralized feature management\tFeast, Tecton, Hopsworks\nModel registry\tVersion, stage, approve models\tMLflow Registry, SageMaker\nPipeline orchestration\tDAG-based ML workflows\tAirflow, Prefect, Dagster, Kubeflow\nModel serving\tLow-latency inference\tTriton, TorchServe, vLLM, BentoML\nMonitoring\tDrift, performance, data quality\tEvidently, Whylogs, Great Expectations\nVector store\tEmbedding storage for RAG\tPinecone, Weaviate, pgvector, Qdrant\nGPU management\tTraining and inference compute\tK8s + GPU operator, RunPod, Modal\nCI/CD for ML\nml_cicd:\n  on_code_change:\n    - lint_and_type_check\n    - unit_tests (data transforms, feature logic)\n    - integration_tests (pipeline end-to-end on sample data)\n  on_data_change:\n    - data_validation (Great Expectations / custom)\n    - feature_pipeline_run\n    - smoke_test_predictions\n  on_model_change:\n    - full_evaluation_suite\n    - bias_and_fairness_check\n    - performance_regression_test\n    - model_size_and_latency_check\n    - security_scan (model file, dependencies)\n    - staging_deployment\n    - integration_test_in_staging\n    - 
approval_gate (manual for major versions)\n    - canary_deployment\n\nModel Registry Workflow\n┌──────────────┐      ┌──────────────┐      ┌──────────────┐\n│  Development │ ───→ │   Staging    │ ───→ │  Production  │\n│              │      │              │      │              │\n│ - Experiment │      │ - Eval suite │      │ - Canary     │\n│ - Log metrics│      │ - Load test  │      │ - Monitor    │\n│ - Compare    │      │ - Approval   │      │ - Rollback   │\n└──────────────┘      └──────────────┘      └──────────────┘\n\n\nPromotion criteria:\n\nDev → Staging: Beats current production on offline metrics\nStaging → Production: Passes load test + integration test + human approval\nAuto-rollback: Error rate >2x OR latency >2x OR primary metric drops >5%\nPhase 9: Responsible AI\nBias Detection Checklist\n Training data represents all demographic groups proportionally\n Performance metrics broken down by protected attributes\n Equal opportunity: similar true positive rates across groups\n Calibration: predicted probabilities match actual rates per group\n No proxy features for protected attributes (ZIP code → race)\n Fairness metric selected and threshold defined BEFORE training\n Disparate impact ratio >0.8 (80% rule)\n Edge cases tested: what happens with unusual inputs?\nModel Card Template\nmodel_card:\n  model_name: \"\"\n  version: \"\"\n  date: \"\"\n  owner: \"\"\n  description: \"\"\n  intended_use: \"\"\n  out_of_scope_uses: \"\"\n  training_data:\n    source: \"\"\n    size: \"\"\n    date_range: \"\"\n    known_biases: \"\"\n  evaluation:\n    metrics: {}\n    datasets: []\n    sliced_metrics: {}             # Performance by subgroup\n  limitations: []\n  ethical_considerations: []\n  maintenance:\n    retraining_schedule: \"\"\n    monitoring: \"\"\n    contact: \"\"\n\nPhase 10: Cost & Performance Optimization\nGPU Selection Guide\nUse Case\tGPU\tVRAM\tCost/hr (cloud)\tBest For\nFine-tune 7B model\tA10G\t24GB\t~$1\tLoRA/QLoRA fine-tuning\nFine-tune 70B model\tA100 80GB\t80GB\t~$4\tFull fine-tuning medium models\nServe 7B model\tT4\t16GB\t~$0.50\tInference at scale\nServe 70B model\tA100 40GB\t40GB\t~$2\tLarge model inference\nTrain from scratch\tH100\t80GB\t~$8\tPre-training, large-scale training\nInference Optimization Techniques\nTechnique\tSpeedup\tQuality Impact\tComplexity\nQuantization (INT8)\t2-3x\t<1% degradation\tLow\nQuantization (INT4/GPTQ)\t3-4x\t1-3% degradation\tMedium\nBatching\t2-10x throughput\tNone\tLow\nKV-cache optimization\t20-40% memory savings\tNone\tMedium\nSpeculative decoding\t2-3x for LLMs\tNone (mathematically exact)\tHigh\nModel distillation\t5-10x smaller model\t2-5% degradation\tHigh\nONNX Runtime\t1.5-3x\tNone\tLow\nTensorRT\t2-5x\t<1%\tMedium\nvLLM (PagedAttention)\t2-4x throughput for LLMs\tNone\tLow\nCost Tracking Template\nml_costs:\n  training:\n    compute_cost_per_run: null\n    runs_per_month: null\n    data_storage_monthly: null\n    experiment_tracking: null\n  inference:\n    cost_per_1k_predictions: null\n    daily_volume: null\n    monthly_cost: null\n    cost_per_query_breakdown:\n      compute: null\n      model_api_calls: null\n      vector_db: null\n      data_transfer: null\n  optimization_targets:\n    cost_per_prediction: null      # Target\n    monthly_budget: null\n    cost_reduction_goal: \"\"\n\nPhase 11: ML System Quality Rubric\n\nScore your ML system (0-100):\n\nDimension\tWeight\t0-2 (Poor)\t3-4 (Good)\t5 (Excellent)\nProblem framing\t15%\tNo clear business metric\tDefined success metric\tKill criteria + baseline + 
ROI estimate\nData quality\t15%\tAd-hoc data, no validation\tAutomated quality checks\tFeature store + lineage + versioning\nExperiment rigor\t15%\tNo tracking, one-off notebooks\tMLflow/W&B tracking\tReproducible pipelines + proper evaluation\nModel performance\t15%\tBarely beats baseline\tSignificant improvement\tCalibrated, fair, robust to edge cases\nDeployment\t10%\tManual deployment\tCI/CD for models\tCanary + auto-rollback + A/B testing\nMonitoring\t15%\tNo monitoring\tBasic metrics dashboard\tDrift detection + auto-retraining + alerts\nDocumentation\t5%\tNothing documented\tModel card exists\tFull model card + runbooks + decision log\nCost efficiency\t10%\tNo cost tracking\tBudget exists\tOptimized inference + cost-per-prediction tracking\n\nScoring:\n\n80-100: Production-grade ML system\n60-79: Good foundations, missing operational maturity\n40-59: Prototype quality, not ready for production\n<40: Science project, needs fundamental rework\nCommon Mistakes\nMistake\tFix\nOptimizing model before fixing data\tData quality > model complexity. Always.\nUsing accuracy on imbalanced data\tUse PR-AUC, F1, or domain-specific metric\nNo baseline comparison\tAlways start with simple heuristic baseline\nTraining on future data\tTemporal splits for time-series, strict leakage checks\nDeploying without monitoring\tNo model in production without drift detection\nFine-tuning when prompting works\tTry few-shot prompting first — fine-tune only for scale/cost\nGPU for everything\tCPU inference is often sufficient and 10x cheaper\nIgnoring calibration\tIf probabilities matter (risk scoring), calibrate\nOne-time model deployment\tML is a continuous system — plan for retraining from day 1\nPremature scaling\tProve value with batch predictions before building real-time serving\nQuick Commands\n\"Frame ML problem\" → Phase 1 brief\n\"Assess data quality\" → Phase 2 scoring\n\"Select model\" → Phase 3 guide\n\"Evaluate model\" → Phase 4 checklist\n\"Deploy model\" → Phase 5 serving config\n\"Build RAG\" → Phase 6 RAG architecture\n\"Set up monitoring\" → Phase 7 dashboard\n\"Optimize costs\" → Phase 10 tracking\n\"Score ML system\" → Phase 11 rubric\n\"Detect drift\" → Phase 7 playbook\n\"A/B test model\" → Phase 5 template\n\"Create model card\" → Phase 9 template"
  },
  "trust": {
    "sourceLabel": "tencent",
    "provenanceUrl": "https://clawhub.ai/1kalin/afrexai-ml-engineering",
    "publisherUrl": "https://clawhub.ai/1kalin/afrexai-ml-engineering",
    "owner": "1kalin",
    "version": "1.0.0",
    "license": null,
    "verificationStatus": "Indexed source record"
  },
  "links": {
    "detailUrl": "https://openagent3.xyz/skills/afrexai-ml-engineering",
    "downloadUrl": "https://openagent3.xyz/downloads/afrexai-ml-engineering",
    "agentUrl": "https://openagent3.xyz/skills/afrexai-ml-engineering/agent",
    "manifestUrl": "https://openagent3.xyz/skills/afrexai-ml-engineering/agent.json",
    "briefUrl": "https://openagent3.xyz/skills/afrexai-ml-engineering/agent.md"
  }
}