{
  "schemaVersion": "1.0",
  "item": {
    "slug": "afrexai-data-engineering",
    "name": "Data Engineering",
    "source": "tencent",
    "type": "skill",
    "category": "其他",
    "sourceUrl": "https://clawhub.ai/1kalin/afrexai-data-engineering",
    "canonicalUrl": "https://clawhub.ai/1kalin/afrexai-data-engineering",
    "targetPlatform": "OpenClaw"
  },
  "install": {
    "downloadMode": "redirect",
    "downloadUrl": "/downloads/afrexai-data-engineering",
    "sourceDownloadUrl": "https://wry-manatee-359.convex.site/api/v1/download?slug=afrexai-data-engineering",
    "sourcePlatform": "tencent",
    "targetPlatform": "OpenClaw",
    "installMethod": "Manual import",
    "extraction": "Extract archive",
    "prerequisites": [
      "OpenClaw"
    ],
    "packageFormat": "ZIP package",
    "includedAssets": [
      "README.md",
      "SKILL.md"
    ],
    "primaryDoc": "SKILL.md",
    "quickSetup": [
      "Download the package from Yavira.",
      "Extract the archive and review SKILL.md first.",
      "Import or place the package into your OpenClaw setup."
    ],
    "agentAssist": {
      "summary": "Hand the extracted package to your coding agent with a concrete install brief instead of figuring it out manually.",
      "steps": [
        "Download the package from Yavira.",
        "Extract it into a folder your agent can access.",
        "Paste one of the prompts below and point your agent at the extracted folder."
      ],
      "prompts": [
        {
          "label": "New install",
          "body": "I downloaded a skill package from Yavira. Read SKILL.md from the extracted folder and install it by following the included instructions. Then review README.md for any prerequisites, environment setup, or post-install checks. Tell me what you changed and call out any manual steps you could not complete."
        },
        {
          "label": "Upgrade existing",
          "body": "I downloaded an updated skill package from Yavira. Read SKILL.md from the extracted folder, compare it with my current installation, and upgrade it while preserving any custom configuration unless the package docs explicitly say otherwise. Then review README.md for any prerequisites, environment setup, or post-install checks. Summarize what changed and any follow-up checks I should run."
        }
      ]
    },
    "sourceHealth": {
      "source": "tencent",
      "slug": "afrexai-data-engineering",
      "status": "healthy",
      "reason": "direct_download_ok",
      "recommendedAction": "download",
      "checkedAt": "2026-04-29T04:37:47.007Z",
      "expiresAt": "2026-05-06T04:37:47.007Z",
      "httpStatus": 200,
      "finalUrl": "https://wry-manatee-359.convex.site/api/v1/download?slug=afrexai-data-engineering",
      "contentType": "application/zip",
      "probeMethod": "head",
      "details": {
        "probeUrl": "https://wry-manatee-359.convex.site/api/v1/download?slug=afrexai-data-engineering",
        "contentDisposition": "attachment; filename=\"afrexai-data-engineering-1.0.0.zip\"",
        "redirectLocation": null,
        "bodySnippet": null,
        "slug": "afrexai-data-engineering"
      },
      "scope": "item",
      "summary": "Item download looks usable.",
      "detail": "Yavira can redirect you to the upstream package for this item.",
      "primaryActionLabel": "Download for OpenClaw",
      "primaryActionHref": "/downloads/afrexai-data-engineering"
    },
    "validation": {
      "installChecklist": [
        "Use the Yavira download entry.",
        "Review SKILL.md after the package is downloaded.",
        "Confirm the extracted package contains the expected setup assets."
      ],
      "postInstallChecks": [
        "Confirm the extracted package includes the expected docs or setup files.",
        "Validate the skill or prompts are available in your target agent workspace.",
        "Capture any manual follow-up steps the agent could not complete."
      ]
    },
    "downloadPageUrl": "https://openagent3.xyz/downloads/afrexai-data-engineering",
    "agentPageUrl": "https://openagent3.xyz/skills/afrexai-data-engineering/agent",
    "manifestUrl": "https://openagent3.xyz/skills/afrexai-data-engineering/agent.json",
    "briefUrl": "https://openagent3.xyz/skills/afrexai-data-engineering/agent.md"
  },
  "agentAssist": {
    "summary": "Hand the extracted package to your coding agent with a concrete install brief instead of figuring it out manually.",
    "steps": [
      "Download the package from Yavira.",
      "Extract it into a folder your agent can access.",
      "Paste one of the prompts below and point your agent at the extracted folder."
    ],
    "prompts": [
      {
        "label": "New install",
        "body": "I downloaded a skill package from Yavira. Read SKILL.md from the extracted folder and install it by following the included instructions. Then review README.md for any prerequisites, environment setup, or post-install checks. Tell me what you changed and call out any manual steps you could not complete."
      },
      {
        "label": "Upgrade existing",
        "body": "I downloaded an updated skill package from Yavira. Read SKILL.md from the extracted folder, compare it with my current installation, and upgrade it while preserving any custom configuration unless the package docs explicitly say otherwise. Then review README.md for any prerequisites, environment setup, or post-install checks. Summarize what changed and any follow-up checks I should run."
      }
    ]
  },
  "documentation": {
    "source": "clawhub",
    "primaryDoc": "SKILL.md",
    "sections": [
      {
        "title": "Data Engineering Command Center",
        "body": "Complete methodology for designing, building, operating, and scaling data pipelines and infrastructure. Zero dependencies — pure agent skill."
      },
      {
        "title": "Phase 1: Data Architecture Assessment",
        "body": "Before building anything, understand the landscape."
      },
      {
        "title": "Architecture Brief",
        "body": "project_name: \"\"\nbusiness_context: \"\"\ndata_consumers:\n  - team: \"\"\n    use_case: \"\"          # analytics | ML | operational | reporting | reverse-ETL\n    latency_requirement: \"\"  # real-time (<1s) | near-real-time (<5min) | batch (hourly+)\n    query_pattern: \"\"     # ad-hoc | scheduled | API | dashboard\n\ncurrent_state:\n  sources: []             # list every system producing data\n  storage: []             # where data lives today\n  pain_points: []         # what's broken, slow, unreliable\n  data_volume:\n    current_gb_per_day: 0\n    growth_rate_percent: 0\n    retention_months: 0\n\nconstraints:\n  budget_monthly_usd: 0\n  team_size: 0\n  skill_level: \"\"         # junior | mid | senior | mixed\n  compliance: []          # GDPR, HIPAA, SOX, PCI, none\n  cloud_provider: \"\"      # AWS | GCP | Azure | multi | on-prem"
      },
      {
        "title": "Architecture Pattern Decision Matrix",
        "body": "SignalPatternWhen to UseAll consumers need data hourly+Batch ETLReporting, warehousing, most analyticsSome need <5 min latencyMicro-batchDashboard freshness, near-real-time analyticsEvents need <1s processingStreamingFraud detection, real-time pricing, alertsNeed both batch + streamingLambdaWhen batch accuracy + real-time speed both matterWant to simplify LambdaKappaWhen you can reprocess from stream replayData lake + warehouse combinedLakehouseWhen you need both cheap storage + fast SQLSources change independentlyData MeshLarge orgs, domain-owned data, >5 teamsML is primary consumerFeature StoreML-heavy orgs with feature reuse needs"
      },
      {
        "title": "Technology Selection Guide",
        "body": "Orchestration\n\nToolBest ForAvoid WhenAirflowComplex DAGs, Python-native teams, mature ecosystemSimple pipelines (<5 tasks)DagsterSoftware-defined assets, strong typing, dev experienceLegacy team resistant to new paradigmsPrefectDynamic workflows, cloud-native, Python-firstNeed on-prem with no cloud dependencydbtSQL transformations, ELT, analytics engineeringNon-SQL transforms, streamingTemporalLong-running workflows, retry-heavy, microservicesSimple ETL, small teamsCron + scripts<3 pipelines, solo engineer, simple schedulesAnything with dependencies or retries\n\nProcessing\n\nToolBest ForAvoid WhenSpark>100GB, complex transforms, ML pipelines<10GB (overkill), real-time streamingDuckDBLocal analytics, <100GB, SQL on filesDistributed processing, production streamingPolarsSingle-node, Rust-speed, <50GB, DataFramesDistributed, need Spark ecosystemPandas<1GB, quick analysis, prototypingProduction pipelines, anything >5GBFlinkTrue streaming, event-time processingBatch-only, small team (steep learning curve)SQL (warehouse)ELT in Snowflake/BigQuery/RedshiftComplex ML transforms, binary data\n\nStorage\n\nToolBest ForAvoid WhenSnowflakeAnalytics, separation of compute/storage, multi-cloudTight budget, real-time OLTPBigQueryGCP-native, serverless, large-scale analyticsMulti-cloud, need fine-grained cost controlRedshiftAWS-native, existing AWS ecosystemElastic scaling needs, multi-cloudDatabricksML + analytics unified, Spark-native, lakehousePure SQL analytics, small dataPostgreSQLOLTP + light analytics, <500GB, budget-conscious>1TB analytics, real-time dashboards on large dataS3/GCS/ADLSRaw data lake, cheap storage, any formatDirect SQL queries (need compute layer)Delta Lake/IcebergTable format on data lake, ACID on filesSimple file storage, no lakehouse need"
      },
      {
        "title": "Modeling Methodology Decision",
        "body": "ApproachBest ForKey ConceptKimball (Dimensional)BI/reporting, star schemasFacts + Dimensions, business-process-centricInmon (3NF)Enterprise data warehouse, single source of truthNormalized, subject-area-centricData Vault 2.0Agile warehousing, auditability, multiple sourcesHubs + Links + Satellites, insert-onlyOne Big Table (OBT)Simple analytics, few joins, dashboard performancePre-joined, denormalized, fast queriesActivity SchemaEvent analytics, product analyticsEntity + Activity + Feature columns"
      },
      {
        "title": "Dimensional Model Template",
        "body": "fact_table:\n  name: \"fact_[business_process]\"\n  grain: \"\"                    # one row = one [what]?\n  grain_statement: \"One row per [transaction/event/snapshot] at [time grain]\"\n  measures:\n    - name: \"\"\n      type: \"\"                 # additive | semi-additive | non-additive\n      aggregation: \"\"          # SUM | AVG | COUNT | MIN | MAX | COUNT DISTINCT\n      business_definition: \"\"\n  degenerate_dimensions: []    # IDs stored in fact (order_number, invoice_id)\n  foreign_keys: []             # links to dimension tables\n\ndimensions:\n  - name: \"dim_[entity]\"\n    type: \"\"                   # Type 1 (overwrite) | Type 2 (history) | Type 3 (previous value)\n    natural_key: \"\"            # business key from source\n    surrogate_key: \"\"          # warehouse-generated key\n    attributes:\n      - name: \"\"\n        source: \"\"\n        scd_type: \"\"           # 1 | 2 | 3\n    hierarchy: []              # e.g., [country, region, city, store]"
      },
      {
        "title": "SCD Type Decision Guide",
        "body": "ScenarioSCD TypeImplementationDon't care about historyType 1UPDATE in placeNeed full historyType 2New row + valid_from/valid_to + is_current flagOnly need previous valueType 3Add previous_[column]Track changes with timestampsType 4Mini-dimension (history table)Hybrid: some attrs Type 1, some Type 2Type 6Combine 1+2+3 in one table\n\nDefault recommendation: Type 2 for anything business-critical (customer status, product price, employee department). Type 1 for everything else."
      },
      {
        "title": "Naming Conventions",
        "body": "ObjectConventionExampleRaw/staging tablesraw_[source]_[table]raw_stripe_paymentsStaging modelsstg_[source]__[entity]stg_stripe__paymentsIntermediate modelsint_[entity]_[verb]int_orders_pivotedMart/fact tablesfct_[business_process]fct_ordersDimension tablesdim_[entity]dim_customersMetrics/aggregatesmrt_[domain]_[metric]mrt_sales_dailySnapshotssnp_[entity]_[grain]snp_inventory_dailyColumns: booleanis_[state] or has_[thing]is_active, has_subscriptionColumns: timestamp[event]_atcreated_at, shipped_atColumns: date[event]_dateorder_dateColumns: ID[entity]_idcustomer_idColumns: amount[thing]_amountorder_amountColumns: count[thing]_countline_item_count"
      },
      {
        "title": "Universal Pipeline Template",
        "body": "pipeline:\n  name: \"\"\n  owner: \"\"\n  schedule: \"\"               # cron expression\n  sla_minutes: 0             # max acceptable runtime\n  tier: \"\"                   # 1 (critical) | 2 (important) | 3 (nice-to-have)\n\n  extract:\n    source_system: \"\"\n    connection: \"\"\n    strategy: \"\"             # full | incremental | CDC | log-based\n    incremental_key: \"\"      # column for incremental (e.g., updated_at)\n    watermark_storage: \"\"    # where to persist last-extracted position\n\n  transform:\n    engine: \"\"               # SQL | Spark | Python | dbt\n    stages:\n      - name: \"clean\"\n        operations: []       # dedupe, null handling, type casting, trimming\n      - name: \"conform\"\n        operations: []       # standardize codes, currencies, timezones\n      - name: \"enrich\"\n        operations: []       # lookups, calculations, derived fields\n      - name: \"aggregate\"\n        operations: []       # rollups, pivots, window functions\n\n  load:\n    target_system: \"\"\n    strategy: \"\"             # append | upsert | merge | truncate-reload | partition-swap\n    merge_keys: []\n    partition_key: \"\"\n    clustering_keys: []\n\n  quality_gates:\n    pre_load: []             # checks before writing\n    post_load: []            # checks after writing\n\n  error_handling:\n    strategy: \"\"             # fail-fast | dead-letter | retry | skip-and-alert\n    max_retries: 3\n    retry_delay_seconds: 300\n    alert_channels: []"
      },
      {
        "title": "Extraction Strategy Decision Tree",
        "body": "Is the source database?\n├── Yes → Does it support CDC?\n│   ├── Yes → Use CDC (Debezium, AWS DMS, Fivetran)\n│   │   Best for: high-volume, low-latency, minimal source impact\n│   └── No → Does it have a reliable updated_at column?\n│       ├── Yes → Incremental extraction on updated_at\n│       │   ⚠️ Won't catch hard deletes — add periodic full reconciliation\n│       └── No → Full extraction\n│           Only viable for small tables (<1M rows)\n├── Is it an API?\n│   ├── Supports webhooks? → Event-driven ingestion\n│   ├── Has cursor/pagination? → Incremental with cursor bookmark\n│   └── No pagination? → Full pull with rate-limit handling\n├── Is it files (S3, SFTP, email)?\n│   └── Event-triggered (S3 notification, file watcher)\n│       Validate: schema, completeness, filename pattern\n└── Is it streaming (Kafka, Kinesis, Pub/Sub)?\n    └── Consumer group with offset management\n        Key decisions: at-least-once vs exactly-once, consumer lag alerting"
      },
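      {
        "title": "Incremental Extraction Sketch (Illustrative)",
        "body": "A minimal Python sketch of the incremental branch above: pull rows newer than an externally stored watermark, load them idempotently, and only then advance the watermark. The fetch_rows and load_rows hooks and the state-file path are illustrative, not part of this package.\n\nimport json\nfrom pathlib import Path\n\nWATERMARK_FILE = Path('state/orders_watermark.json')  # persisted outside pipeline code\n\ndef read_watermark() -> str:\n    # Default to epoch start on the very first run\n    if WATERMARK_FILE.exists():\n        return json.loads(WATERMARK_FILE.read_text())['updated_at']\n    return '1970-01-01T00:00:00Z'\n\ndef write_watermark(value: str) -> None:\n    WATERMARK_FILE.parent.mkdir(parents=True, exist_ok=True)\n    WATERMARK_FILE.write_text(json.dumps({'updated_at': value}))\n\ndef run_incremental(fetch_rows, load_rows) -> int:\n    # fetch_rows(since) -> list[dict]; load_rows(rows) must be an idempotent MERGE\n    since = read_watermark()\n    rows = fetch_rows(since)\n    if not rows:\n        return 0\n    load_rows(rows)\n    # Advance the watermark only after a successful load\n    write_watermark(max(r['updated_at'] for r in rows))\n    return len(rows)"
      },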
      {
        "title": "Load Strategy Decision",
        "body": "StrategyWhenTrade-offAppendEvent/log data, immutable factsSimple but grows forever — partition + retainUpsert/MergeDimension updates, SCD Type 1Handles updates but slower on large tablesTruncate-ReloadSmall tables (<1M), reference dataSimple but window of missing dataPartition SwapLarge fact tables, daily loadsAtomic, fast, but needs partition alignmentSoft DeleteNeed audit trail of deletionsAdds complexity to every downstream query"
      },
      {
        "title": "Idempotency Rules (NON-NEGOTIABLE)",
        "body": "Every pipeline MUST be re-runnable without side effects:\n\nUse MERGE/UPSERT, never blind INSERT for mutable data\nPartition-swap for immutable data — drop partition + reload\nStore watermarks externally — not in the pipeline code\nDedup at ingestion — use source natural keys\nTest by running twice — output must be identical both times"
      },
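      {
        "title": "Run-Twice Idempotency Test (Illustrative)",
        "body": "One way to enforce the 'test by running twice' rule: a Python sketch that fingerprints the target table, re-runs the pipeline with no new source data, and fails if the output changed. run_pipeline and read_target are hypothetical hooks.\n\nimport hashlib\n\ndef table_fingerprint(rows) -> str:\n    # Order-independent checksum over fully serialized rows\n    digests = sorted(\n        hashlib.sha256(repr(sorted(r.items())).encode()).hexdigest() for r in rows\n    )\n    return hashlib.sha256(''.join(digests).encode()).hexdigest()\n\ndef assert_idempotent(run_pipeline, read_target) -> None:\n    # read_target() returns every target row as a dict\n    run_pipeline()\n    first = table_fingerprint(read_target())\n    run_pipeline()  # re-run with no new source data\n    second = table_fingerprint(read_target())\n    assert first == second, 'Pipeline is not idempotent: output changed on re-run'"
      },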
      {
        "title": "Quality Dimensions",
        "body": "DimensionDefinitionExample CheckCompletenessNo missing values where requiredNOT NULL on required fields, row count within rangeUniquenessNo unexpected duplicatesPrimary key uniqueness, natural key uniquenessValidityValues within expected domainEnum checks, range checks, regex patternsAccuracyData matches real-world truthCross-system reconciliation, manual spot checksFreshnessData arrives on timeMAX(loaded_at) > NOW() - INTERVAL '2 hours'ConsistencySame data agrees across systemsSum reconciliation between source and target"
      },
      {
        "title": "Quality Check Templates",
        "body": "-- Completeness: Required fields not null\nSELECT COUNT(*) AS null_violations\nFROM {table}\nWHERE {required_column} IS NULL;\n-- Threshold: 0\n\n-- Uniqueness: No duplicate primary keys\nSELECT {pk_column}, COUNT(*) AS dupe_count\nFROM {table}\nGROUP BY {pk_column}\nHAVING COUNT(*) > 1;\n-- Threshold: 0\n\n-- Freshness: Data arrived within SLA\nSELECT CASE\n  WHEN MAX({timestamp_col}) > CURRENT_TIMESTAMP - INTERVAL '{sla_hours} hours'\n  THEN 'PASS' ELSE 'FAIL'\nEND AS freshness_check\nFROM {table};\n\n-- Volume: Row count within expected range\nSELECT CASE\n  WHEN COUNT(*) BETWEEN {min_expected} AND {max_expected}\n  THEN 'PASS' ELSE 'FAIL'\nEND AS volume_check\nFROM {table}\nWHERE {partition_col} = '{run_date}';\n\n-- Referential: FK integrity\nSELECT COUNT(*) AS orphan_count\nFROM {fact_table} f\nLEFT JOIN {dim_table} d ON f.{fk} = d.{pk}\nWHERE d.{pk} IS NULL;\n-- Threshold: 0\n\n-- Distribution: No unexpected skew\nSELECT {column}, COUNT(*) AS cnt,\n  ROUND(100.0 * COUNT(*) / SUM(COUNT(*)) OVER (), 2) AS pct\nFROM {table}\nGROUP BY {column}\nORDER BY cnt DESC;\n-- Alert if any single value > {max_pct}%\n\n-- Cross-system reconciliation\nSELECT\n  (SELECT SUM(amount) FROM source_system.orders WHERE date = '{date}') AS source_total,\n  (SELECT SUM(amount) FROM warehouse.fct_orders WHERE order_date = '{date}') AS target_total,\n  ABS(source_total - target_total) AS variance;\n-- Threshold: variance < 0.01 * source_total (1%)"
      },
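      {
        "title": "Quality Check Runner Sketch (Illustrative)",
        "body": "The SQL templates above are engine-agnostic; a small runner can parameterize and execute them through any DB-API connection. This Python sketch covers the completeness and uniqueness checks; identifiers are assumed to come from trusted pipeline config, not user input.\n\ndef run_checks(conn, table: str, pk: str, required: list[str]) -> list[str]:\n    # Returns human-readable failures; an empty list means all checks passed\n    failures = []\n    cur = conn.cursor()\n    for col in required:\n        # Completeness: required fields not null (threshold 0)\n        cur.execute(f'SELECT COUNT(*) FROM {table} WHERE {col} IS NULL')\n        if (n := cur.fetchone()[0]) > 0:\n            failures.append(f'{table}.{col}: {n} null violations')\n    # Uniqueness: no duplicate primary keys (threshold 0)\n    cur.execute(\n        f'SELECT COUNT(*) FROM (SELECT {pk} FROM {table} '\n        f'GROUP BY {pk} HAVING COUNT(*) > 1) d'\n    )\n    if (n := cur.fetchone()[0]) > 0:\n        failures.append(f'{table}.{pk}: {n} duplicated keys')\n    return failures"
      },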
      {
        "title": "Data Contract Template",
        "body": "contract:\n  name: \"\"\n  version: \"\"\n  owner: \"\"                    # team responsible for producing this data\n  consumers: []                # teams consuming this data\n  sla:\n    freshness_hours: 0\n    availability_percent: 99.9\n    support_hours: \"\"          # business-hours | 24x7\n\n  schema:\n    - column: \"\"\n      type: \"\"\n      nullable: false\n      description: \"\"\n      business_definition: \"\"\n      pii: false\n      checks:\n        - type: \"\"             # not_null | unique | range | enum | regex | custom\n          params: {}\n\n  breaking_change_policy: \"\"   # notify-30-days | version-bump | never-break\n  notification_channel: \"\""
      },
      {
        "title": "Quality Severity Levels",
        "body": "LevelDefinitionResponseP0 — CriticalData corruption, wrong numbers in production dashboards, compliance data wrongStop pipeline, alert immediately, rollback if possibleP1 — HighMissing data for key reports, SLA breach, >5% of records affectedAlert team, fix within 4 hours, post-mortem requiredP2 — MediumNon-critical field quality, <1% records affected, no downstream impactFix in next sprint, add monitoring to prevent recurrenceP3 — LowCosmetic issues, edge cases, non-critical dataBacklog, fix when convenient"
      },
      {
        "title": "SQL Optimization Checklist",
        "body": "ProblemFixImpactFull table scanAdd/use partition pruning10-100x fasterLarge joinsPre-aggregate before joining5-50x fasterSELECT *Select only needed columns2-10x faster (columnar stores)Correlated subqueryRewrite as JOIN or window function10-100x fasterDISTINCT on large resultFix upstream duplication instead2-5x fasterORDER BY without LIMITAdd LIMIT or remove if not neededPrevents memory spillsString operations in WHEREPre-compute, use lookup tableEnables index usageMultiple passes over same dataCombine with CASE WHEN + GROUP BY2-5x fasterNOT IN with NULLsUse NOT EXISTS or LEFT JOIN IS NULLCorrectness + performance"
      },
      {
        "title": "Spark Optimization Guide",
        "body": "ProblemSolutionShuffle-heavy joinsBroadcast small table (broadcast(df)) if <100MBData skewSalt the skewed key: add random prefix, join on salted key, aggregateSmall filesCoalesce output: .coalesce(target_files) or use adaptive query executionToo many partitionsspark.sql.shuffle.partitions = 2-3x cluster coresOOM errorsIncrease spark.executor.memory, reduce partition size, spill to diskSlow writesUse Parquet with snappy, partition by date, avoid small writesRepeated computation.cache() or .persist() DataFrames used >1 timeComplex transformationsPush down predicates, filter early, select early"
      },
      {
        "title": "Partitioning Strategy",
        "body": "Data TypePartition KeyWhyTransactional/eventDate (daily or monthly)Most queries filter by time rangeMulti-tenantTenant ID + dateIsolate tenant queries, time-range pruningGeospatialRegion + dateRegional queries are commonLog dataDate + hourHigh volume needs finer partitionsReference/dimensionDon't partitionToo small, full scan is fine\n\nRules:\n\nTarget 100MB-1GB per partition (compressed)\n<10,000 total partitions per table\nNever partition on high-cardinality columns (user_id)\nAlways include partition key in WHERE clauses"
      },
      {
        "title": "Data Classification",
        "body": "LevelExamplesControlsPublicProduct catalog, published statsNo restrictionsInternalAggregated metrics, non-PII analyticsAuth required, audit loggingConfidentialCustomer PII, financial records, HR dataEncryption, column-level access, maskingRestrictedSSN, payment cards, health records, passwordsEncryption at rest + transit, tokenization, audit every access, retention limits"
      },
      {
        "title": "PII Handling Rules",
        "body": "Identify: Scan all sources for PII columns (name, email, phone, SSN, DOB, address, IP)\nClassify: Tag each with sensitivity level\nMinimize: Only ingest PII you actually need\nProtect:\n\nHash or tokenize in staging (SHA-256 with salt for pseudonymization)\nDynamic masking for non-privileged users\nColumn-level encryption for restricted data\n\n\nRetain: Auto-delete after retention period\nAudit: Log every query touching PII columns\nRight to delete: Build a deletion pipeline that propagates across all derived tables"
      },
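      {
        "title": "Pseudonymization Sketch (Illustrative)",
        "body": "A sketch of the 'hash or tokenize in staging' step. A keyed hash (HMAC-SHA-256) is one robust way to implement the salted-hash recommendation: pseudonyms stay consistent across tables while the key stays secret. The PII_HASH_KEY env var is illustrative; keep the real key in a secrets manager.\n\nimport hashlib\nimport hmac\nimport os\n\nKEY = os.environ['PII_HASH_KEY'].encode()  # illustrative env var\n\ndef pseudonymize(value: str) -> str:\n    # Normalize first so 'A@B.com' and ' a@b.com' map to the same pseudonym\n    normalized = value.strip().lower()\n    return hmac.new(KEY, normalized.encode(), hashlib.sha256).hexdigest()\n\n# pseudonymize('alice@example.com') -> stable 64-char hex token"
      },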
      {
        "title": "Data Catalog Entry Template",
        "body": "dataset:\n  name: \"\"\n  description: \"\"\n  owner_team: \"\"\n  steward: \"\"                  # person responsible for quality\n  domain: \"\"                   # sales | marketing | finance | product | engineering\n  tier: \"\"                     # gold (trusted) | silver (cleaned) | bronze (raw)\n  \n  lineage:\n    sources: []                # upstream datasets/systems\n    transformations: \"\"        # brief description of key transforms\n    downstream: []             # who consumes this\n  \n  refresh:\n    schedule: \"\"\n    sla_hours: 0\n    last_successful_run: \"\"\n  \n  quality:\n    tests: []                  # list of quality checks\n    last_score: 0              # 0-100\n    known_issues: []\n  \n  access:\n    classification: \"\"         # public | internal | confidential | restricted\n    pii_columns: []\n    access_request_process: \"\" # how to get access\n  \n  usage:\n    avg_daily_queries: 0\n    top_consumers: []\n    cost_monthly_usd: 0"
      },
      {
        "title": "Pipeline Health Dashboard",
        "body": "dashboard:\n  pipeline_metrics:\n    - metric: \"Pipeline Success Rate\"\n      formula: \"successful_runs / total_runs * 100\"\n      target: \">99%\"\n      alert_threshold: \"<95%\"\n\n    - metric: \"Average Runtime\"\n      formula: \"avg(end_time - start_time) over 7 days\"\n      target: \"<SLA\"\n      alert_threshold: \">80% of SLA\"\n\n    - metric: \"Data Freshness\"\n      formula: \"NOW() - MAX(loaded_at)\"\n      target: \"<SLA hours\"\n      alert_threshold: \">SLA\"\n\n    - metric: \"Data Volume Variance\"\n      formula: \"abs(today_rows - avg_7d_rows) / avg_7d_rows * 100\"\n      target: \"<20%\"\n      alert_threshold: \">50%\"\n\n    - metric: \"Quality Check Pass Rate\"\n      formula: \"passed_checks / total_checks * 100\"\n      target: \"100%\"\n      alert_threshold: \"<95%\"\n\n    - metric: \"Failed Pipeline Count\"\n      formula: \"count where status = failed in last 24h\"\n      target: \"0\"\n      alert_threshold: \">0\"\n\n    - metric: \"Backfill Queue\"\n      formula: \"count of pending backfill requests\"\n      target: \"0\"\n      alert_threshold: \">5\"\n\n    - metric: \"Infrastructure Cost\"\n      formula: \"compute + storage + egress\"\n      target: \"<budget\"\n      alert_threshold: \">110% budget\""
      },
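      {
        "title": "Metric Computation Sketch (Illustrative)",
        "body": "Given run records shaped like the Structured Logging Standard below, the dashboard metrics reduce to simple arithmetic. A Python sketch of two of them, under that assumption:\n\ndef success_rate(runs: list[dict]) -> float:\n    # Pipeline Success Rate = successful_runs / total_runs * 100 (target >99%)\n    total = len(runs)\n    ok = sum(1 for r in runs if r['status'] == 'success')\n    return 100.0 * ok / total if total else 100.0\n\ndef volume_variance(today_rows: int, last_7d_rows: list[int]) -> float:\n    # Data Volume Variance = abs(today - avg_7d) / avg_7d * 100 (alert >50%)\n    avg_7d = sum(last_7d_rows) / len(last_7d_rows)\n    return abs(today_rows - avg_7d) / avg_7d * 100.0"
      },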
      {
        "title": "Alert Severity",
        "body": "SeverityConditionResponse TimeExampleP0Revenue/compliance impacting15 minPayment pipeline down, regulatory report delayedP1Business-critical dashboard stale1 hourExecutive dashboard >4h staleP2Non-critical pipeline failed4 hoursMarketing attribution delayedP3Warning/degradationNext business dayPipeline 80% of SLA, minor quality drift"
      },
      {
        "title": "Structured Logging Standard",
        "body": "Every pipeline run MUST log:\n\n{\n  \"pipeline_name\": \"\",\n  \"run_id\": \"\",\n  \"started_at\": \"\",\n  \"completed_at\": \"\",\n  \"status\": \"success|failed|partial\",\n  \"stage\": \"\",\n  \"rows_extracted\": 0,\n  \"rows_transformed\": 0,\n  \"rows_loaded\": 0,\n  \"rows_rejected\": 0,\n  \"quality_checks_passed\": 0,\n  \"quality_checks_failed\": 0,\n  \"duration_seconds\": 0,\n  \"error_message\": \"\",\n  \"watermark_before\": \"\",\n  \"watermark_after\": \"\"\n}"
      },
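      {
        "title": "Structured Log Emission Sketch (Illustrative)",
        "body": "Emitting that record takes nothing beyond the Python stdlib: one JSON object per run on stdout, shipped to the aggregator from there. Field values here are placeholders filled in by the caller.\n\nimport json\nimport sys\nimport time\nimport uuid\nfrom datetime import datetime, timezone\n\ndef log_run(pipeline_name: str, status: str, counts: dict, started: float, error: str = '') -> None:\n    record = {\n        'pipeline_name': pipeline_name,\n        'run_id': str(uuid.uuid4()),\n        'started_at': datetime.fromtimestamp(started, timezone.utc).isoformat(),\n        'completed_at': datetime.now(timezone.utc).isoformat(),\n        'status': status,  # success | failed | partial\n        'duration_seconds': round(time.time() - started, 3),\n        'error_message': error,\n        **counts,  # rows_extracted, rows_loaded, quality_checks_failed, ...\n    }\n    print(json.dumps(record), file=sys.stdout)"
      },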
      {
        "title": "Pipeline Test Pyramid",
        "body": "LayerWhat to TestHowWhenUnitIndividual transforms, business logicpytest with fixtures, dbt unit testsEvery PRIntegrationSource connectivity, schema compatibilityTest against staging/dev environmentDaily + PRContractSchema hasn't changed, data types stableSchema registry, contract testsEvery pipeline runData QualityCompleteness, uniqueness, freshness, validityQuality framework checksEvery pipeline runE2EFull pipeline produces correct outputGolden dataset comparisonWeekly + releasePerformanceRuntime within SLA, no regressionBenchmark against historical runsWeekly"
      },
      {
        "title": "dbt Testing Checklist",
        "body": "# For every model, define at minimum:\nmodels:\n  - name: fct_orders\n    columns:\n      - name: order_id\n        tests:\n          - unique\n          - not_null\n      - name: customer_id\n        tests:\n          - not_null\n          - relationships:\n              to: ref('dim_customers')\n              field: customer_id\n      - name: order_amount\n        tests:\n          - not_null\n          - dbt_utils.accepted_range:\n              min_value: 0\n              max_value: 1000000\n      - name: order_status\n        tests:\n          - accepted_values:\n              values: ['pending', 'confirmed', 'shipped', 'delivered', 'cancelled']\n      - name: ordered_at\n        tests:\n          - not_null\n          - dbt_utils.recency:\n              datepart: day\n              field: ordered_at\n              interval: 2"
      },
      {
        "title": "Backfill Protocol",
        "body": "When you need to reprocess historical data:\n\nScope: Define exact date range and affected tables\nImpact assessment: What downstream models/dashboards will be affected?\nCommunication: Notify consumers of temporary data inconsistency\nIsolation: Run backfill in separate compute to avoid impacting current pipelines\nValidation: Compare row counts and key metrics pre/post backfill\nExecution: Process in reverse-chronological order (most recent first)\nMonitoring: Watch for resource spikes, duplicate creation\nVerification: Reconcile against source after completion\nDocumentation: Log what was backfilled, why, and any anomalies found"
      },
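      {
        "title": "Backfill Loop Sketch (Illustrative)",
        "body": "The execution and validation steps of the protocol as a Python sketch: process partitions most-recent-first and validate each one before moving on. process_partition and validate_partition are hypothetical hooks supplied by the pipeline.\n\nfrom datetime import date, timedelta\n\ndef backfill(start: date, end: date, process_partition, validate_partition) -> None:\n    # Reverse-chronological: most recent partitions first, per the protocol\n    day = end\n    while day >= start:\n        process_partition(day)           # idempotent reload of one partition\n        if not validate_partition(day):  # row counts / key metrics vs. source\n            raise RuntimeError(f'Backfill validation failed for {day}')\n        day -= timedelta(days=1)"
      },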
      {
        "title": "Cloud Cost Reduction Strategies",
        "body": "StrategySavingsEffortRight-size compute (auto-scaling)20-40%LowUse spot/preemptible instances for batch60-80%MediumCompress data (Parquet + Snappy/Zstd)50-80% storageLowLifecycle policies (hot → warm → cold → archive)40-70% storageLowEliminate unused tables/pipelines10-30%LowOptimize query patterns (partition pruning)30-60% computeMediumReserved capacity for steady-state30-50%MediumCache expensive queries20-50% computeMedium"
      },
      {
        "title": "Cost Allocation Template",
        "body": "cost_tracking:\n  by_pipeline:\n    - pipeline: \"\"\n      compute_monthly_usd: 0\n      storage_monthly_usd: 0\n      egress_monthly_usd: 0\n      total: 0\n      cost_per_row: 0        # total / rows_processed\n      business_value: \"\"     # what revenue/decision does this enable?\n      roi_justified: true    # is the cost worth it?\n\n  optimization_opportunities:\n    - description: \"\"\n      estimated_savings_usd: 0\n      effort: \"\"             # low | medium | high\n      priority: 0            # 1 = do now"
      },
      {
        "title": "Cost Red Flags",
        "body": "Single pipeline >30% of total spend\nCost per row increasing month-over-month\nTables with 0 queries in 30 days\nDev/staging environments running 24/7\nFull table scans on >1TB tables\nUncompressed data in cloud storage\nCross-region data transfer"
      },
      {
        "title": "Pipeline Failure Triage",
        "body": "Pipeline failed →\n1. Check error message in logs\n   ├── Connection timeout → Check source availability, network, credentials\n   ├── Schema mismatch → Source schema changed → update extract + notify\n   ├── Data quality check failed → Investigate source data, check for anomalies\n   ├── Out of memory → Increase resources or optimize query\n   ├── Permission denied → Check IAM roles, token expiry\n   ├── Duplicate key violation → Check idempotency, investigate source dupes\n   └── Timeout (SLA breach) → Check data volume spike, query plan, cluster health\n\n2. Determine impact\n   ├── What dashboards/reports are affected?\n   ├── What's the data freshness SLA?\n   └── Who needs to be notified?\n\n3. Fix\n   ├── Transient (network, timeout) → Retry\n   ├── Data issue → Fix source data, re-run with quality gate override if safe\n   ├── Schema change → Update pipeline, backfill if needed\n   └── Infrastructure → Scale up, file ticket with cloud provider\n\n4. Post-fix\n   ├── Verify data correctness\n   ├── Update runbook with new failure mode\n   └── Add monitoring/alerting to catch earlier next time"
      },
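      {
        "title": "Retry Classification Sketch (Illustrative)",
        "body": "Step 3 of the triage tree ('transient → retry') as a Python sketch: retry transient errors with the delay from the pipeline template, and let everything else (schema, permissions, data quality) propagate for triage. The exception mapping is illustrative; wire in your client libraries' error classes.\n\nimport time\n\nTRANSIENT = (TimeoutError, ConnectionError)  # illustrative mapping\n\ndef run_with_retry(task, max_retries: int = 3, retry_delay_seconds: int = 300):\n    for attempt in range(1, max_retries + 1):\n        try:\n            return task()\n        except TRANSIENT:\n            # Transient (network, timeout) -> retry after a delay\n            if attempt == max_retries:\n                raise\n            time.sleep(retry_delay_seconds)"
      },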
      {
        "title": "Schema Change Management",
        "body": "When a source system changes schema:\n\nDetect: Schema comparison check in extraction pipeline (hash schema, compare to registered)\nClassify:\n\nAdditive (new column): Usually safe — add to pipeline, backfill if needed\nRename: Map old → new in transform, update downstream\nType change: Assess compatibility, may need cast or historical rebuild\nColumn removed: Critical — breaks queries, need immediate attention\n\n\nTest: Run pipeline in dry-run mode with new schema\nDeploy: Update transforms, quality checks, documentation\nCommunicate: Notify downstream consumers via data contract channel"
      },
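      {
        "title": "Schema Drift Detection Sketch (Illustrative)",
        "body": "The 'hash schema, compare to registered' step as a Python sketch: fingerprint the live schema, then classify the diff into the categories above. The schema shape ({column_name: type}) is an assumption for illustration.\n\nimport hashlib\nimport json\n\ndef schema_hash(schema: dict[str, str]) -> str:\n    # Stable fingerprint: sorted keys, so column order does not matter\n    return hashlib.sha256(json.dumps(schema, sort_keys=True).encode()).hexdigest()\n\ndef classify_change(registered: dict[str, str], live: dict[str, str]) -> dict:\n    added = sorted(set(live) - set(registered))    # additive: usually safe\n    removed = sorted(set(registered) - set(live))  # critical: breaks queries\n    retyped = sorted(\n        c for c in set(live) & set(registered) if live[c] != registered[c]\n    )\n    return {\n        'added': added,\n        'removed': removed,\n        'type_changed': retyped,\n        'breaking': bool(removed or retyped),\n    }"
      },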
      {
        "title": "Disaster Recovery",
        "body": "ScenarioRPORTORecovery StepsPipeline code lost0 (git)1hRedeploy from git, restore orchestrator stateWarehouse data corruptedVaries4hRestore from Time Travel/snapshot, re-run affected pipelinesSource system downN/AWaitQueue extractions, catch up with incremental once restoredCloud region outage24h8hFailover to DR region if configured, else waitCredential compromise02hRotate all credentials, audit access logs, re-run affected pipelines"
      },
      {
        "title": "Slowly Changing Dimension Type 2 (SQL Template)",
        "body": "-- Merge pattern for SCD Type 2\nMERGE INTO dim_customer AS target\nUSING (\n  SELECT * FROM stg_customers\n  WHERE updated_at > (SELECT MAX(valid_from) FROM dim_customer)\n) AS source\nON target.customer_natural_key = source.customer_id\n   AND target.is_current = TRUE\n\n-- Update: close old record\nWHEN MATCHED AND (\n  target.customer_name != source.name OR\n  target.customer_status != source.status\n  -- list all Type 2 tracked columns\n) THEN UPDATE SET\n  is_current = FALSE,\n  valid_to = CURRENT_TIMESTAMP\n\n-- Insert: new record (both new customers and changed ones)\nWHEN NOT MATCHED THEN INSERT (\n  customer_natural_key, customer_name, customer_status,\n  valid_from, valid_to, is_current\n) VALUES (\n  source.customer_id, source.name, source.status,\n  CURRENT_TIMESTAMP, '9999-12-31', TRUE\n);\n\n-- Then insert new versions of changed records\nINSERT INTO dim_customer (\n  customer_natural_key, customer_name, customer_status,\n  valid_from, valid_to, is_current\n)\nSELECT customer_id, name, status,\n  CURRENT_TIMESTAMP, '9999-12-31', TRUE\nFROM stg_customers s\nWHERE EXISTS (\n  SELECT 1 FROM dim_customer d\n  WHERE d.customer_natural_key = s.customer_id\n    AND d.is_current = FALSE\n    AND d.valid_to = CURRENT_TIMESTAMP\n);"
      },
      {
        "title": "CDC with Debezium (Architecture Pattern)",
        "body": "Source DB → Debezium Connector → Kafka Topic → \n  ├── Stream processor (Flink/Spark Streaming) → Target DB\n  ├── S3 sink connector → Data Lake (raw)\n  └── Elasticsearch sink → Search index\n\nKey decisions:\n\nTopic per table or single topic: Per table (easier routing, independent scaling)\nSchema registry: Always use (Confluent Schema Registry or AWS Glue)\nSerialization: Avro (compact + schema evolution) or Protobuf (strict + fast)\nOffset management: Connector manages; monitor consumer lag"
      },
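      {
        "title": "Consumer Offset Management Sketch (Illustrative)",
        "body": "A sketch of the consumer side using the confluent-kafka Python client (assumed installed; broker, group, and topic names are placeholders). Offsets are committed only after the batch is durably written, which gives at-least-once delivery and pairs with an idempotent MERGE on the target.\n\nfrom confluent_kafka import Consumer  # assumes: pip install confluent-kafka\n\ndef upsert_to_target(payload: bytes) -> None:\n    raise NotImplementedError  # hypothetical idempotent MERGE by natural key\n\nconsumer = Consumer({\n    'bootstrap.servers': 'localhost:9092',  # placeholder\n    'group.id': 'orders-cdc-loader',        # placeholder\n    'enable.auto.commit': False,            # commit manually after durable write\n    'auto.offset.reset': 'earliest',\n})\nconsumer.subscribe(['dbserver1.public.orders'])  # Debezium topic-per-table naming\n\ntry:\n    while True:\n        msg = consumer.poll(1.0)\n        if msg is None or msg.error():\n            continue\n        upsert_to_target(msg.value())                     # write durably first...\n        consumer.commit(message=msg, asynchronous=False)  # ...then commit offsets\nfinally:\n    consumer.close()"
      },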
      {
        "title": "Feature Store Pattern",
        "body": "feature_store:\n  entity: \"customer\"\n  entity_key: \"customer_id\"\n  \n  features:\n    - name: \"total_orders_30d\"\n      description: \"Total orders in last 30 days\"\n      type: \"INT\"\n      source: \"fct_orders\"\n      computation: \"batch\"      # batch | streaming | on-demand\n      freshness: \"daily\"\n      ttl_hours: 48\n    \n    - name: \"avg_order_value_90d\"\n      description: \"Average order value last 90 days\"\n      type: \"FLOAT\"\n      source: \"fct_orders\"\n      computation: \"batch\"\n      freshness: \"daily\"\n      ttl_hours: 48\n    \n    - name: \"last_login_minutes_ago\"\n      description: \"Minutes since last login event\"\n      type: \"INT\"\n      source: \"events_stream\"\n      computation: \"streaming\"\n      freshness: \"real-time\"\n      ttl_hours: 1\n  \n  serving:\n    online: true               # low-latency feature serving (Redis/DynamoDB)\n    offline: true              # batch feature retrieval for training\n    point_in_time_correct: true  # prevent feature leakage in ML training"
      },
      {
        "title": "Data Mesh Principles",
        "body": "If operating at scale (>5 data teams):\n\nDomain ownership: Each business domain owns its data products (not central data team)\nData as a product: Treat datasets like products — SLAs, documentation, discoverability\nSelf-serve platform: Central team builds the platform, domains build on top\nFederated governance: Standards and interoperability maintained centrally, implementation decentralized\n\nWhen NOT to use Data Mesh:\n\n<5 data producers/consumers\nSmall team (<20 engineers total)\nSingle business domain\nEarly-stage company (over-engineering)"
      },
      {
        "title": "Quality Scoring Rubric (0-100)",
        "body": "DimensionWeightScoringPipeline Reliability200=frequent failures, 10=some failures with manual recovery, 20=99.5%+ success rate with auto-retryData Quality200=no checks, 10=basic null/unique checks, 20=comprehensive quality framework with contractsPerformance150=regularly breaches SLA, 8=meets SLA, 15=well under SLA with optimizationDocumentation100=none, 5=basic README, 10=full catalog entries with lineage and business definitionsMonitoring150=no alerts, 8=failure alerts only, 15=proactive monitoring with dashboards and anomaly detectionTesting100=no tests, 5=basic smoke tests, 10=full test pyramid (unit+integration+contract+E2E)Cost Efficiency100=no cost tracking, 5=tracked, 10=optimized with ROI justification per pipeline\n\nScoring guide:\n\n0-40: Critical gaps — prioritize pipeline reliability and data quality\n41-60: Functional but fragile — add monitoring, testing, documentation\n61-80: Solid — optimize performance, cost, governance\n81-100: Excellent — maintain, innovate, mentor"
      },
      {
        "title": "Timezone Traps",
        "body": "Store everything in UTC. Convert only at presentation layer\nEvent timestamps: use event time, not processing time\nDaylight saving: TIMESTAMP WITH TIME ZONE, never WITHOUT\nLate-arriving data: watermark strategy + allowed lateness window"
      },
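      {
        "title": "UTC Handling Sketch (Illustrative)",
        "body": "The UTC rule in code, a minimal stdlib-only Python sketch (zoneinfo needs Python 3.9+): store timezone-aware UTC timestamps and convert only at the presentation layer.\n\nfrom datetime import datetime, timezone\nfrom zoneinfo import ZoneInfo  # Python 3.9+\n\n# Store: always timezone-aware UTC (maps to TIMESTAMP WITH TIME ZONE)\nevent_at = datetime.now(timezone.utc)\n\n# Present: convert at the edge only; DST is handled by the zone database\nlocal_view = event_at.astimezone(ZoneInfo('America/New_York'))\n\n# Reject naive timestamps before they reach storage\nassert event_at.tzinfo is not None"
      },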
      {
        "title": "Late-Arriving Data",
        "body": "Define maximum acceptable lateness per source\nReprocess affected partitions when late data arrives\nTrack late arrival rate as a quality metric\nConsider separate \"late data\" pipeline that patches in"
      },
      {
        "title": "Exactly-Once Processing",
        "body": "True exactly-once is expensive. Most systems need at-least-once + idempotent writes\nUse transaction IDs or natural keys for deduplication\nKafka: use idempotent producer + transactional consumer\nDatabase: MERGE/UPSERT on natural key"
      },
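      {
        "title": "Idempotent Write Sketch (Illustrative)",
        "body": "The 'at-least-once + idempotent writes' combination in a runnable Python sketch: SQLite's ON CONFLICT upsert stands in for a warehouse MERGE, and replaying the same event twice still leaves exactly one row keyed by the natural key.\n\nimport sqlite3\n\nconn = sqlite3.connect(':memory:')\nconn.execute('CREATE TABLE orders (order_id TEXT PRIMARY KEY, amount REAL, updated_at TEXT)')\n\ndef apply_event(evt: dict) -> None:\n    # Natural-key upsert: re-delivered events are harmless\n    conn.execute(\n        'INSERT INTO orders (order_id, amount, updated_at) '\n        'VALUES (:order_id, :amount, :updated_at) '\n        'ON CONFLICT(order_id) DO UPDATE SET '\n        'amount = excluded.amount, updated_at = excluded.updated_at',\n        evt,\n    )\n\nevent = {'order_id': 'o-1', 'amount': 42.0, 'updated_at': '2026-01-01T00:00:00Z'}\napply_event(event)\napply_event(event)  # at-least-once redelivery\nassert conn.execute('SELECT COUNT(*) FROM orders').fetchone()[0] == 1"
      },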
      {
        "title": "Schema Evolution",
        "body": "Forward compatible: New code reads old data (safe to deploy new readers first)\nBackward compatible: Old code reads new data (safe to deploy new writers first)\nFull compatible: Both directions (safest, most restrictive)\nUse Avro or Protobuf with schema registry for streaming data"
      },
      {
        "title": "Multi-Tenant Data",
        "body": "Tenant ID in every table, every query, every log\nRow-level security in warehouse\nSeparate compute per tenant (or at least isolation)\nNever join across tenants without explicit business reason\nTenant-aware backfill (don't rebuild all tenants for one tenant's issue)"
      },
      {
        "title": "Data Lake Anti-Patterns",
        "body": "\"Data Swamp\": ingesting everything with no organization or catalog → only ingest what has a known consumer\nSmall files: thousands of <1MB files → compact regularly (target 100MB-1GB)\nNo table format: raw Parquet/CSV without Delta/Iceberg → loses ACID, schema evolution, time travel\nNo access controls: single bucket, everyone admin → implement IAM per domain/team"
      },
      {
        "title": "Natural Language Commands",
        "body": "Say any of these to activate specific workflows:\n\n\"Design a data pipeline for [source] to [target]\" → Full pipeline template with extraction strategy, transforms, load pattern, quality checks\n\"Model [entity/domain] for analytics\" → Dimensional model with fact/dimension tables, grain, measures, SCD types\n\"Optimize this query/pipeline\" → Performance analysis with specific recommendations\n\"Set up data quality for [table/pipeline]\" → Quality framework with checks, contracts, monitoring\n\"Audit our data infrastructure\" → Full assessment using scoring rubric\n\"Help with [Spark/Airflow/dbt/Kafka] issue\" → Troubleshooting with technology-specific guidance\n\"Design a data catalog for our org\" → Catalog template with governance, classification, lineage\n\"Plan a data migration from [old] to [new]\" → Migration plan with validation, rollback, parallel-run\n\"Set up monitoring for our pipelines\" → Dashboard template with alerts, logging standards, runbooks\n\"Review our data costs\" → Cost analysis with optimization strategies and ROI framework\n\"Handle schema change in [source]\" → Change management protocol with impact assessment\n\"Backfill [table] for [date range]\" → Backfill protocol with validation and communication plan"
      }
    ],
    "body": "Data Engineering Command Center\n\nComplete methodology for designing, building, operating, and scaling data pipelines and infrastructure. Zero dependencies — pure agent skill.\n\nPhase 1: Data Architecture Assessment\n\nBefore building anything, understand the landscape.\n\nArchitecture Brief\nproject_name: \"\"\nbusiness_context: \"\"\ndata_consumers:\n  - team: \"\"\n    use_case: \"\"          # analytics | ML | operational | reporting | reverse-ETL\n    latency_requirement: \"\"  # real-time (<1s) | near-real-time (<5min) | batch (hourly+)\n    query_pattern: \"\"     # ad-hoc | scheduled | API | dashboard\n\ncurrent_state:\n  sources: []             # list every system producing data\n  storage: []             # where data lives today\n  pain_points: []         # what's broken, slow, unreliable\n  data_volume:\n    current_gb_per_day: 0\n    growth_rate_percent: 0\n    retention_months: 0\n\nconstraints:\n  budget_monthly_usd: 0\n  team_size: 0\n  skill_level: \"\"         # junior | mid | senior | mixed\n  compliance: []          # GDPR, HIPAA, SOX, PCI, none\n  cloud_provider: \"\"      # AWS | GCP | Azure | multi | on-prem\n\nArchitecture Pattern Decision Matrix\nSignal\tPattern\tWhen to Use\nAll consumers need data hourly+\tBatch ETL\tReporting, warehousing, most analytics\nSome need <5 min latency\tMicro-batch\tDashboard freshness, near-real-time analytics\nEvents need <1s processing\tStreaming\tFraud detection, real-time pricing, alerts\nNeed both batch + streaming\tLambda\tWhen batch accuracy + real-time speed both matter\nWant to simplify Lambda\tKappa\tWhen you can reprocess from stream replay\nData lake + warehouse combined\tLakehouse\tWhen you need both cheap storage + fast SQL\nSources change independently\tData Mesh\tLarge orgs, domain-owned data, >5 teams\nML is primary consumer\tFeature Store\tML-heavy orgs with feature reuse needs\nTechnology Selection Guide\nOrchestration\nTool\tBest For\tAvoid When\nAirflow\tComplex DAGs, Python-native teams, mature ecosystem\tSimple pipelines (<5 tasks)\nDagster\tSoftware-defined assets, strong typing, dev experience\tLegacy team resistant to new paradigms\nPrefect\tDynamic workflows, cloud-native, Python-first\tNeed on-prem with no cloud dependency\ndbt\tSQL transformations, ELT, analytics engineering\tNon-SQL transforms, streaming\nTemporal\tLong-running workflows, retry-heavy, microservices\tSimple ETL, small teams\nCron + scripts\t<3 pipelines, solo engineer, simple schedules\tAnything with dependencies or retries\nProcessing\nTool\tBest For\tAvoid When\nSpark\t>100GB, complex transforms, ML pipelines\t<10GB (overkill), real-time streaming\nDuckDB\tLocal analytics, <100GB, SQL on files\tDistributed processing, production streaming\nPolars\tSingle-node, Rust-speed, <50GB, DataFrames\tDistributed, need Spark ecosystem\nPandas\t<1GB, quick analysis, prototyping\tProduction pipelines, anything >5GB\nFlink\tTrue streaming, event-time processing\tBatch-only, small team (steep learning curve)\nSQL (warehouse)\tELT in Snowflake/BigQuery/Redshift\tComplex ML transforms, binary data\nStorage\nTool\tBest For\tAvoid When\nSnowflake\tAnalytics, separation of compute/storage, multi-cloud\tTight budget, real-time OLTP\nBigQuery\tGCP-native, serverless, large-scale analytics\tMulti-cloud, need fine-grained cost control\nRedshift\tAWS-native, existing AWS ecosystem\tElastic scaling needs, multi-cloud\nDatabricks\tML + analytics unified, Spark-native, lakehouse\tPure SQL analytics, small data\nPostgreSQL\tOLTP + light analytics, 
<500GB, budget-conscious\t>1TB analytics, real-time dashboards on large data\nS3/GCS/ADLS\tRaw data lake, cheap storage, any format\tDirect SQL queries (need compute layer)\nDelta Lake/Iceberg\tTable format on data lake, ACID on files\tSimple file storage, no lakehouse need\nPhase 2: Data Modeling\nModeling Methodology Decision\nApproach\tBest For\tKey Concept\nKimball (Dimensional)\tBI/reporting, star schemas\tFacts + Dimensions, business-process-centric\nInmon (3NF)\tEnterprise data warehouse, single source of truth\tNormalized, subject-area-centric\nData Vault 2.0\tAgile warehousing, auditability, multiple sources\tHubs + Links + Satellites, insert-only\nOne Big Table (OBT)\tSimple analytics, few joins, dashboard performance\tPre-joined, denormalized, fast queries\nActivity Schema\tEvent analytics, product analytics\tEntity + Activity + Feature columns\nDimensional Model Template\nfact_table:\n  name: \"fact_[business_process]\"\n  grain: \"\"                    # one row = one [what]?\n  grain_statement: \"One row per [transaction/event/snapshot] at [time grain]\"\n  measures:\n    - name: \"\"\n      type: \"\"                 # additive | semi-additive | non-additive\n      aggregation: \"\"          # SUM | AVG | COUNT | MIN | MAX | COUNT DISTINCT\n      business_definition: \"\"\n  degenerate_dimensions: []    # IDs stored in fact (order_number, invoice_id)\n  foreign_keys: []             # links to dimension tables\n\ndimensions:\n  - name: \"dim_[entity]\"\n    type: \"\"                   # Type 1 (overwrite) | Type 2 (history) | Type 3 (previous value)\n    natural_key: \"\"            # business key from source\n    surrogate_key: \"\"          # warehouse-generated key\n    attributes:\n      - name: \"\"\n        source: \"\"\n        scd_type: \"\"           # 1 | 2 | 3\n    hierarchy: []              # e.g., [country, region, city, store]\n\nSCD Type Decision Guide\nScenario\tSCD Type\tImplementation\nDon't care about history\tType 1\tUPDATE in place\nNeed full history\tType 2\tNew row + valid_from/valid_to + is_current flag\nOnly need previous value\tType 3\tAdd previous_[column]\nTrack changes with timestamps\tType 4\tMini-dimension (history table)\nHybrid: some attrs Type 1, some Type 2\tType 6\tCombine 1+2+3 in one table\n\nDefault recommendation: Type 2 for anything business-critical (customer status, product price, employee department). 
Type 1 for everything else.\n\nNaming Conventions\nObject\tConvention\tExample\nRaw/staging tables\traw_[source]_[table]\traw_stripe_payments\nStaging models\tstg_[source]__[entity]\tstg_stripe__payments\nIntermediate models\tint_[entity]_[verb]\tint_orders_pivoted\nMart/fact tables\tfct_[business_process]\tfct_orders\nDimension tables\tdim_[entity]\tdim_customers\nMetrics/aggregates\tmrt_[domain]_[metric]\tmrt_sales_daily\nSnapshots\tsnp_[entity]_[grain]\tsnp_inventory_daily\nColumns: boolean\tis_[state] or has_[thing]\tis_active, has_subscription\nColumns: timestamp\t[event]_at\tcreated_at, shipped_at\nColumns: date\t[event]_date\torder_date\nColumns: ID\t[entity]_id\tcustomer_id\nColumns: amount\t[thing]_amount\torder_amount\nColumns: count\t[thing]_count\tline_item_count\nPhase 3: Pipeline Design Patterns\nUniversal Pipeline Template\npipeline:\n  name: \"\"\n  owner: \"\"\n  schedule: \"\"               # cron expression\n  sla_minutes: 0             # max acceptable runtime\n  tier: \"\"                   # 1 (critical) | 2 (important) | 3 (nice-to-have)\n\n  extract:\n    source_system: \"\"\n    connection: \"\"\n    strategy: \"\"             # full | incremental | CDC | log-based\n    incremental_key: \"\"      # column for incremental (e.g., updated_at)\n    watermark_storage: \"\"    # where to persist last-extracted position\n\n  transform:\n    engine: \"\"               # SQL | Spark | Python | dbt\n    stages:\n      - name: \"clean\"\n        operations: []       # dedupe, null handling, type casting, trimming\n      - name: \"conform\"\n        operations: []       # standardize codes, currencies, timezones\n      - name: \"enrich\"\n        operations: []       # lookups, calculations, derived fields\n      - name: \"aggregate\"\n        operations: []       # rollups, pivots, window functions\n\n  load:\n    target_system: \"\"\n    strategy: \"\"             # append | upsert | merge | truncate-reload | partition-swap\n    merge_keys: []\n    partition_key: \"\"\n    clustering_keys: []\n\n  quality_gates:\n    pre_load: []             # checks before writing\n    post_load: []            # checks after writing\n\n  error_handling:\n    strategy: \"\"             # fail-fast | dead-letter | retry | skip-and-alert\n    max_retries: 3\n    retry_delay_seconds: 300\n    alert_channels: []\n\nExtraction Strategy Decision Tree\nIs the source database?\n├── Yes → Does it support CDC?\n│   ├── Yes → Use CDC (Debezium, AWS DMS, Fivetran)\n│   │   Best for: high-volume, low-latency, minimal source impact\n│   └── No → Does it have a reliable updated_at column?\n│       ├── Yes → Incremental extraction on updated_at\n│       │   ⚠️ Won't catch hard deletes — add periodic full reconciliation\n│       └── No → Full extraction\n│           Only viable for small tables (<1M rows)\n├── Is it an API?\n│   ├── Supports webhooks? → Event-driven ingestion\n│   ├── Has cursor/pagination? → Incremental with cursor bookmark\n│   └── No pagination? 
→ Full pull with rate-limit handling\n├── Is it files (S3, SFTP, email)?\n│   └── Event-triggered (S3 notification, file watcher)\n│       Validate: schema, completeness, filename pattern\n└── Is it streaming (Kafka, Kinesis, Pub/Sub)?\n    └── Consumer group with offset management\n        Key decisions: at-least-once vs exactly-once, consumer lag alerting\n\nLoad Strategy Decision\nStrategy\tWhen\tTrade-off\nAppend\tEvent/log data, immutable facts\tSimple but grows forever — partition + retain\nUpsert/Merge\tDimension updates, SCD Type 1\tHandles updates but slower on large tables\nTruncate-Reload\tSmall tables (<1M), reference data\tSimple but window of missing data\nPartition Swap\tLarge fact tables, daily loads\tAtomic, fast, but needs partition alignment\nSoft Delete\tNeed audit trail of deletions\tAdds complexity to every downstream query\nIdempotency Rules (NON-NEGOTIABLE)\n\nEvery pipeline MUST be re-runnable without side effects:\n\nUse MERGE/UPSERT, never blind INSERT for mutable data\nPartition-swap for immutable data — drop partition + reload\nStore watermarks externally — not in the pipeline code\nDedup at ingestion — use source natural keys\nTest by running twice — output must be identical both times\nPhase 4: Data Quality Framework\nQuality Dimensions\nDimension\tDefinition\tExample Check\nCompleteness\tNo missing values where required\tNOT NULL on required fields, row count within range\nUniqueness\tNo unexpected duplicates\tPrimary key uniqueness, natural key uniqueness\nValidity\tValues within expected domain\tEnum checks, range checks, regex patterns\nAccuracy\tData matches real-world truth\tCross-system reconciliation, manual spot checks\nFreshness\tData arrives on time\tMAX(loaded_at) > NOW() - INTERVAL '2 hours'\nConsistency\tSame data agrees across systems\tSum reconciliation between source and target\nQuality Check Templates\n-- Completeness: Required fields not null\nSELECT COUNT(*) AS null_violations\nFROM {table}\nWHERE {required_column} IS NULL;\n-- Threshold: 0\n\n-- Uniqueness: No duplicate primary keys\nSELECT {pk_column}, COUNT(*) AS dupe_count\nFROM {table}\nGROUP BY {pk_column}\nHAVING COUNT(*) > 1;\n-- Threshold: 0\n\n-- Freshness: Data arrived within SLA\nSELECT CASE\n  WHEN MAX({timestamp_col}) > CURRENT_TIMESTAMP - INTERVAL '{sla_hours} hours'\n  THEN 'PASS' ELSE 'FAIL'\nEND AS freshness_check\nFROM {table};\n\n-- Volume: Row count within expected range\nSELECT CASE\n  WHEN COUNT(*) BETWEEN {min_expected} AND {max_expected}\n  THEN 'PASS' ELSE 'FAIL'\nEND AS volume_check\nFROM {table}\nWHERE {partition_col} = '{run_date}';\n\n-- Referential: FK integrity\nSELECT COUNT(*) AS orphan_count\nFROM {fact_table} f\nLEFT JOIN {dim_table} d ON f.{fk} = d.{pk}\nWHERE d.{pk} IS NULL;\n-- Threshold: 0\n\n-- Distribution: No unexpected skew\nSELECT {column}, COUNT(*) AS cnt,\n  ROUND(100.0 * COUNT(*) / SUM(COUNT(*)) OVER (), 2) AS pct\nFROM {table}\nGROUP BY {column}\nORDER BY cnt DESC;\n-- Alert if any single value > {max_pct}%\n\n-- Cross-system reconciliation\nSELECT\n  (SELECT SUM(amount) FROM source_system.orders WHERE date = '{date}') AS source_total,\n  (SELECT SUM(amount) FROM warehouse.fct_orders WHERE order_date = '{date}') AS target_total,\n  ABS(source_total - target_total) AS variance;\n-- Threshold: variance < 0.01 * source_total (1%)\n\nData Contract Template\ncontract:\n  name: \"\"\n  version: \"\"\n  owner: \"\"                    # team responsible for producing this data\n  consumers: []                # teams consuming this data\n  sla:\n    
freshness_hours: 0\n    availability_percent: 99.9\n    support_hours: \"\"          # business-hours | 24x7\n\n  schema:\n    - column: \"\"\n      type: \"\"\n      nullable: false\n      description: \"\"\n      business_definition: \"\"\n      pii: false\n      checks:\n        - type: \"\"             # not_null | unique | range | enum | regex | custom\n          params: {}\n\n  breaking_change_policy: \"\"   # notify-30-days | version-bump | never-break\n  notification_channel: \"\"\n\nQuality Severity Levels\nLevel\tDefinition\tResponse\nP0 — Critical\tData corruption, wrong numbers in production dashboards, compliance data wrong\tStop pipeline, alert immediately, rollback if possible\nP1 — High\tMissing data for key reports, SLA breach, >5% of records affected\tAlert team, fix within 4 hours, post-mortem required\nP2 — Medium\tNon-critical field quality, <1% records affected, no downstream impact\tFix in next sprint, add monitoring to prevent recurrence\nP3 — Low\tCosmetic issues, edge cases, non-critical data\tBacklog, fix when convenient\nPhase 5: Performance Optimization\nSQL Optimization Checklist\nProblem\tFix\tImpact\nFull table scan\tAdd/use partition pruning\t10-100x faster\nLarge joins\tPre-aggregate before joining\t5-50x faster\nSELECT *\tSelect only needed columns\t2-10x faster (columnar stores)\nCorrelated subquery\tRewrite as JOIN or window function\t10-100x faster\nDISTINCT on large result\tFix upstream duplication instead\t2-5x faster\nORDER BY without LIMIT\tAdd LIMIT or remove if not needed\tPrevents memory spills\nString operations in WHERE\tPre-compute, use lookup table\tEnables index usage\nMultiple passes over same data\tCombine with CASE WHEN + GROUP BY\t2-5x faster\nNOT IN with NULLs\tUse NOT EXISTS or LEFT JOIN IS NULL\tCorrectness + performance\nSpark Optimization Guide\nProblem\tSolution\nShuffle-heavy joins\tBroadcast small table (broadcast(df)) if <100MB\nData skew\tSalt the skewed key: add random prefix, join on salted key, aggregate\nSmall files\tCoalesce output: .coalesce(target_files) or use adaptive query execution\nToo many partitions\tspark.sql.shuffle.partitions = 2-3x cluster cores\nOOM errors\tIncrease spark.executor.memory, reduce partition size, spill to disk\nSlow writes\tUse Parquet with snappy, partition by date, avoid small writes\nRepeated computation\t.cache() or .persist() DataFrames used >1 time\nComplex transformations\tPush down predicates, filter early, select early\nPartitioning Strategy\nData Type\tPartition Key\tWhy\nTransactional/event\tDate (daily or monthly)\tMost queries filter by time range\nMulti-tenant\tTenant ID + date\tIsolate tenant queries, time-range pruning\nGeospatial\tRegion + date\tRegional queries are common\nLog data\tDate + hour\tHigh volume needs finer partitions\nReference/dimension\tDon't partition\tToo small, full scan is fine\n\nRules:\n\nTarget 100MB-1GB per partition (compressed)\n<10,000 total partitions per table\nNever partition on high-cardinality columns (user_id)\nAlways include partition key in WHERE clauses\nPhase 6: Data Governance & Cataloging\nData Classification\nLevel\tExamples\tControls\nPublic\tProduct catalog, published stats\tNo restrictions\nInternal\tAggregated metrics, non-PII analytics\tAuth required, audit logging\nConfidential\tCustomer PII, financial records, HR data\tEncryption, column-level access, masking\nRestricted\tSSN, payment cards, health records, passwords\tEncryption at rest + transit, tokenization, audit every access, retention limits\nPII Handling Rules\nIdentify: 
Scan all sources for PII columns (name, email, phone, SSN, DOB, address, IP)\nClassify: Tag each with sensitivity level\nMinimize: Only ingest PII you actually need\nProtect:\nHash or tokenize in staging (SHA-256 with salt for pseudonymization)\nDynamic masking for non-privileged users\nColumn-level encryption for restricted data\nRetain: Auto-delete after retention period\nAudit: Log every query touching PII columns\nRight to delete: Build a deletion pipeline that propagates across all derived tables\nData Catalog Entry Template\ndataset:\n  name: \"\"\n  description: \"\"\n  owner_team: \"\"\n  steward: \"\"                  # person responsible for quality\n  domain: \"\"                   # sales | marketing | finance | product | engineering\n  tier: \"\"                     # gold (trusted) | silver (cleaned) | bronze (raw)\n  \n  lineage:\n    sources: []                # upstream datasets/systems\n    transformations: \"\"        # brief description of key transforms\n    downstream: []             # who consumes this\n  \n  refresh:\n    schedule: \"\"\n    sla_hours: 0\n    last_successful_run: \"\"\n  \n  quality:\n    tests: []                  # list of quality checks\n    last_score: 0              # 0-100\n    known_issues: []\n  \n  access:\n    classification: \"\"         # public | internal | confidential | restricted\n    pii_columns: []\n    access_request_process: \"\" # how to get access\n  \n  usage:\n    avg_daily_queries: 0\n    top_consumers: []\n    cost_monthly_usd: 0\n\nPhase 7: Pipeline Monitoring & Alerting\nPipeline Health Dashboard\ndashboard:\n  pipeline_metrics:\n    - metric: \"Pipeline Success Rate\"\n      formula: \"successful_runs / total_runs * 100\"\n      target: \">99%\"\n      alert_threshold: \"<95%\"\n\n    - metric: \"Average Runtime\"\n      formula: \"avg(end_time - start_time) over 7 days\"\n      target: \"<SLA\"\n      alert_threshold: \">80% of SLA\"\n\n    - metric: \"Data Freshness\"\n      formula: \"NOW() - MAX(loaded_at)\"\n      target: \"<SLA hours\"\n      alert_threshold: \">SLA\"\n\n    - metric: \"Data Volume Variance\"\n      formula: \"abs(today_rows - avg_7d_rows) / avg_7d_rows * 100\"\n      target: \"<20%\"\n      alert_threshold: \">50%\"\n\n    - metric: \"Quality Check Pass Rate\"\n      formula: \"passed_checks / total_checks * 100\"\n      target: \"100%\"\n      alert_threshold: \"<95%\"\n\n    - metric: \"Failed Pipeline Count\"\n      formula: \"count where status = failed in last 24h\"\n      target: \"0\"\n      alert_threshold: \">0\"\n\n    - metric: \"Backfill Queue\"\n      formula: \"count of pending backfill requests\"\n      target: \"0\"\n      alert_threshold: \">5\"\n\n    - metric: \"Infrastructure Cost\"\n      formula: \"compute + storage + egress\"\n      target: \"<budget\"\n      alert_threshold: \">110% budget\"\n\nAlert Severity\nSeverity\tCondition\tResponse Time\tExample\nP0\tRevenue/compliance impacting\t15 min\tPayment pipeline down, regulatory report delayed\nP1\tBusiness-critical dashboard stale\t1 hour\tExecutive dashboard >4h stale\nP2\tNon-critical pipeline failed\t4 hours\tMarketing attribution delayed\nP3\tWarning/degradation\tNext business day\tPipeline 80% of SLA, minor quality drift\nStructured Logging Standard\n\nEvery pipeline run MUST log:\n\n{\n  \"pipeline_name\": \"\",\n  \"run_id\": \"\",\n  \"started_at\": \"\",\n  \"completed_at\": \"\",\n  \"status\": \"success|failed|partial\",\n  \"stage\": \"\",\n  \"rows_extracted\": 0,\n  \"rows_transformed\": 0,\n  
\"rows_loaded\": 0,\n  \"rows_rejected\": 0,\n  \"quality_checks_passed\": 0,\n  \"quality_checks_failed\": 0,\n  \"duration_seconds\": 0,\n  \"error_message\": \"\",\n  \"watermark_before\": \"\",\n  \"watermark_after\": \"\"\n}\n\nPhase 8: Testing Strategy\nPipeline Test Pyramid\nLayer\tWhat to Test\tHow\tWhen\nUnit\tIndividual transforms, business logic\tpytest with fixtures, dbt unit tests\tEvery PR\nIntegration\tSource connectivity, schema compatibility\tTest against staging/dev environment\tDaily + PR\nContract\tSchema hasn't changed, data types stable\tSchema registry, contract tests\tEvery pipeline run\nData Quality\tCompleteness, uniqueness, freshness, validity\tQuality framework checks\tEvery pipeline run\nE2E\tFull pipeline produces correct output\tGolden dataset comparison\tWeekly + release\nPerformance\tRuntime within SLA, no regression\tBenchmark against historical runs\tWeekly\ndbt Testing Checklist\n# For every model, define at minimum:\nmodels:\n  - name: fct_orders\n    columns:\n      - name: order_id\n        tests:\n          - unique\n          - not_null\n      - name: customer_id\n        tests:\n          - not_null\n          - relationships:\n              to: ref('dim_customers')\n              field: customer_id\n      - name: order_amount\n        tests:\n          - not_null\n          - dbt_utils.accepted_range:\n              min_value: 0\n              max_value: 1000000\n      - name: order_status\n        tests:\n          - accepted_values:\n              values: ['pending', 'confirmed', 'shipped', 'delivered', 'cancelled']\n      - name: ordered_at\n        tests:\n          - not_null\n          - dbt_utils.recency:\n              datepart: day\n              field: ordered_at\n              interval: 2\n\nBackfill Protocol\n\nWhen you need to reprocess historical data:\n\nScope: Define exact date range and affected tables\nImpact assessment: What downstream models/dashboards will be affected?\nCommunication: Notify consumers of temporary data inconsistency\nIsolation: Run backfill in separate compute to avoid impacting current pipelines\nValidation: Compare row counts and key metrics pre/post backfill\nExecution: Process in reverse-chronological order (most recent first)\nMonitoring: Watch for resource spikes, duplicate creation\nVerification: Reconcile against source after completion\nDocumentation: Log what was backfilled, why, and any anomalies found\nPhase 9: Cost Optimization\nCloud Cost Reduction Strategies\nStrategy\tSavings\tEffort\nRight-size compute (auto-scaling)\t20-40%\tLow\nUse spot/preemptible instances for batch\t60-80%\tMedium\nCompress data (Parquet + Snappy/Zstd)\t50-80% storage\tLow\nLifecycle policies (hot → warm → cold → archive)\t40-70% storage\tLow\nEliminate unused tables/pipelines\t10-30%\tLow\nOptimize query patterns (partition pruning)\t30-60% compute\tMedium\nReserved capacity for steady-state\t30-50%\tMedium\nCache expensive queries\t20-50% compute\tMedium\nCost Allocation Template\ncost_tracking:\n  by_pipeline:\n    - pipeline: \"\"\n      compute_monthly_usd: 0\n      storage_monthly_usd: 0\n      egress_monthly_usd: 0\n      total: 0\n      cost_per_row: 0        # total / rows_processed\n      business_value: \"\"     # what revenue/decision does this enable?\n      roi_justified: true    # is the cost worth it?\n\n  optimization_opportunities:\n    - description: \"\"\n      estimated_savings_usd: 0\n      effort: \"\"             # low | medium | high\n      priority: 0            # 1 = do now\n\nCost Red 
Phase 8: Testing Strategy\nPipeline Test Pyramid\nLayer\tWhat to Test\tHow\tWhen\nUnit\tIndividual transforms, business logic\tpytest with fixtures, dbt unit tests\tEvery PR\nIntegration\tSource connectivity, schema compatibility\tTest against staging/dev environment\tDaily + PR\nContract\tSchema hasn't changed, data types stable\tSchema registry, contract tests\tEvery pipeline run\nData Quality\tCompleteness, uniqueness, freshness, validity\tQuality framework checks\tEvery pipeline run\nE2E\tFull pipeline produces correct output\tGolden dataset comparison\tWeekly + release\nPerformance\tRuntime within SLA, no regression\tBenchmark against historical runs\tWeekly\ndbt Testing Checklist\n# For every model, define at minimum:\nmodels:\n  - name: fct_orders\n    columns:\n      - name: order_id\n        tests:\n          - unique\n          - not_null\n      - name: customer_id\n        tests:\n          - not_null\n          - relationships:\n              to: ref('dim_customers')\n              field: customer_id\n      - name: order_amount\n        tests:\n          - not_null\n          - dbt_utils.accepted_range:\n              min_value: 0\n              max_value: 1000000\n      - name: order_status\n        tests:\n          - accepted_values:\n              values: ['pending', 'confirmed', 'shipped', 'delivered', 'cancelled']\n      - name: ordered_at\n        tests:\n          - not_null\n          - dbt_utils.recency:\n              datepart: day\n              field: ordered_at\n              interval: 2\n\nBackfill Protocol\n\nWhen you need to reprocess historical data:\n\nScope: Define the exact date range and affected tables\nImpact assessment: What downstream models/dashboards will be affected?\nCommunication: Notify consumers of temporary data inconsistency\nIsolation: Run the backfill on separate compute to avoid impacting current pipelines\nValidation: Compare row counts and key metrics pre/post backfill (see the sketch after this list)\nExecution: Process in reverse-chronological order (most recent first)\nMonitoring: Watch for resource spikes and duplicate creation\nVerification: Reconcile against the source after completion\nDocumentation: Log what was backfilled, why, and any anomalies found\n
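A minimal Python sketch of that validation step, snapshotting per-day row counts and a key metric before and after the backfill; query_rows is a placeholder for your warehouse client, and fct_orders/order_amount are illustrative names:\n\n# Pre/post backfill reconciliation by partition date\ndef query_rows(sql: str) -> dict:\n    # Placeholder: return {partition_date: (row_count, metric_sum)} from your warehouse\n    raise NotImplementedError\n\ndef snapshot(table: str, start: str, end: str) -> dict:\n    return query_rows(\n        f\"SELECT order_date, COUNT(*), SUM(order_amount) FROM {table} \"\n        f\"WHERE order_date BETWEEN '{start}' AND '{end}' GROUP BY order_date\"\n    )\n\nbefore = snapshot(\"fct_orders\", \"2024-01-01\", \"2024-01-31\")  # capture BEFORE the backfill\n# ... run the backfill ...\nafter = snapshot(\"fct_orders\", \"2024-01-01\", \"2024-01-31\")\n\nfor day in sorted(set(before) | set(after)):\n    b_rows, b_sum = before.get(day, (0, 0))\n    a_rows, a_sum = after.get(day, (0, 0))\n    if (b_rows, b_sum) != (a_rows, a_sum):\n        print(f\"{day}: rows {b_rows} -> {a_rows}, order_amount {b_sum} -> {a_sum}\")\n\n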
Phase 9: Cost Optimization\nCloud Cost Reduction Strategies\nStrategy\tSavings\tEffort\nRight-size compute (auto-scaling)\t20-40%\tLow\nUse spot/preemptible instances for batch\t60-80%\tMedium\nCompress data (Parquet + Snappy/Zstd)\t50-80% storage\tLow\nLifecycle policies (hot → warm → cold → archive)\t40-70% storage\tLow\nEliminate unused tables/pipelines\t10-30%\tLow\nOptimize query patterns (partition pruning)\t30-60% compute\tMedium\nReserved capacity for steady-state\t30-50%\tMedium\nCache expensive queries\t20-50% compute\tMedium\nCost Allocation Template\ncost_tracking:\n  by_pipeline:\n    - pipeline: \"\"\n      compute_monthly_usd: 0\n      storage_monthly_usd: 0\n      egress_monthly_usd: 0\n      total: 0\n      cost_per_row: 0        # total / rows_processed\n      business_value: \"\"     # what revenue/decision does this enable?\n      roi_justified: true    # is the cost worth it?\n\n  optimization_opportunities:\n    - description: \"\"\n      estimated_savings_usd: 0\n      effort: \"\"             # low | medium | high\n      priority: 0            # 1 = do now\n\nCost Red Flags\nSingle pipeline >30% of total spend\nCost per row increasing month-over-month\nTables with 0 queries in 30 days\nDev/staging environments running 24/7\nFull table scans on >1TB tables\nUncompressed data in cloud storage\nCross-region data transfer\nPhase 10: Operational Runbooks\nPipeline Failure Triage\nPipeline failed →\n1. Check error message in logs\n   ├── Connection timeout → Check source availability, network, credentials\n   ├── Schema mismatch → Source schema changed → update extract + notify\n   ├── Data quality check failed → Investigate source data, check for anomalies\n   ├── Out of memory → Increase resources or optimize query\n   ├── Permission denied → Check IAM roles, token expiry\n   ├── Duplicate key violation → Check idempotency, investigate source dupes\n   └── Timeout (SLA breach) → Check data volume spike, query plan, cluster health\n\n2. Determine impact\n   ├── What dashboards/reports are affected?\n   ├── What's the data freshness SLA?\n   └── Who needs to be notified?\n\n3. Fix\n   ├── Transient (network, timeout) → Retry\n   ├── Data issue → Fix source data, re-run with quality gate override if safe\n   ├── Schema change → Update pipeline, backfill if needed\n   └── Infrastructure → Scale up, file ticket with cloud provider\n\n4. Post-fix\n   ├── Verify data correctness\n   ├── Update runbook with new failure mode\n   └── Add monitoring/alerting to catch it earlier next time\n\nSchema Change Management\n\nWhen a source system changes schema:\n\nDetect: Schema comparison check in the extraction pipeline (hash the schema, compare to the registered hash; see the sketch below)\nClassify:\nAdditive (new column): Usually safe — add to pipeline, backfill if needed\nRename: Map old → new in transform, update downstream\nType change: Assess compatibility; may need a cast or historical rebuild\nColumn removed: Critical — breaks queries, needs immediate attention\nTest: Run the pipeline in dry-run mode with the new schema\nDeploy: Update transforms, quality checks, documentation\nCommunicate: Notify downstream consumers via the data contract channel\n
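A minimal Python sketch of that detection step, hashing observed (column, type) pairs and comparing against a registered fingerprint; how the registered hash is stored, and the example columns, are assumptions:\n\n# Detect source schema drift by fingerprinting (column_name, data_type) pairs\nimport hashlib\nimport json\n\ndef schema_fingerprint(columns: list[tuple[str, str]]) -> str:\n    # Sort so column-order changes alone don't trigger drift; adjust if order matters\n    canonical = json.dumps(sorted(columns), separators=(\",\", \":\"))\n    return hashlib.sha256(canonical.encode()).hexdigest()\n\ndef check_schema(source: str, observed: list[tuple[str, str]], registered_hash: str) -> None:\n    current = schema_fingerprint(observed)\n    if current != registered_hash:\n        # Classify the change (additive / rename / type change / removal) before deploying\n        raise RuntimeError(f\"Schema drift detected for {source}: {current[:12]} != registered\")\n\n# Example with illustrative columns as introspected from the source\ncheck_schema(\n    \"crm.customers\",\n    [(\"customer_id\", \"bigint\"), (\"name\", \"varchar\"), (\"status\", \"varchar\")],\n    registered_hash=\"<hash from your schema registry>\",\n)\n\n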
Disaster Recovery\nScenario\tRPO\tRTO\tRecovery Steps\nPipeline code lost\t0 (git)\t1h\tRedeploy from git, restore orchestrator state\nWarehouse data corrupted\tVaries\t4h\tRestore from Time Travel/snapshot, re-run affected pipelines\nSource system down\tN/A\tWait\tQueue extractions, catch up with incremental once restored\nCloud region outage\t24h\t8h\tFailover to DR region if configured, else wait\nCredential compromise\t0\t2h\tRotate all credentials, audit access logs, re-run affected pipelines\nPhase 11: Advanced Patterns\nSlowly Changing Dimension Type 2 (SQL Template)\n-- Merge pattern for SCD Type 2\n-- Note: run the MERGE and the follow-up INSERT in one transaction; the\n-- valid_to = CURRENT_TIMESTAMP match below assumes a timestamp that is\n-- stable within the transaction\nMERGE INTO dim_customer AS target\nUSING (\n  SELECT * FROM stg_customers\n  WHERE updated_at > (SELECT MAX(valid_from) FROM dim_customer)\n) AS source\nON target.customer_natural_key = source.customer_id\n   AND target.is_current = TRUE\n\n-- Update: close the old version of a changed record\nWHEN MATCHED AND (\n  target.customer_name != source.name OR\n  target.customer_status != source.status\n  -- list all Type 2 tracked columns\n) THEN UPDATE SET\n  is_current = FALSE,\n  valid_to = CURRENT_TIMESTAMP\n\n-- Insert: brand-new customers only; changed customers get their new\n-- version from the follow-up INSERT below\nWHEN NOT MATCHED THEN INSERT (\n  customer_natural_key, customer_name, customer_status,\n  valid_from, valid_to, is_current\n) VALUES (\n  source.customer_id, source.name, source.status,\n  CURRENT_TIMESTAMP, '9999-12-31', TRUE\n);\n\n-- Then insert new versions of the records closed above\nINSERT INTO dim_customer (\n  customer_natural_key, customer_name, customer_status,\n  valid_from, valid_to, is_current\n)\nSELECT customer_id, name, status,\n  CURRENT_TIMESTAMP, '9999-12-31', TRUE\nFROM stg_customers s\nWHERE EXISTS (\n  SELECT 1 FROM dim_customer d\n  WHERE d.customer_natural_key = s.customer_id\n    AND d.is_current = FALSE\n    AND d.valid_to = CURRENT_TIMESTAMP\n);\n\nCDC with Debezium (Architecture Pattern)\nSource DB → Debezium Connector → Kafka Topic → \n  ├── Stream processor (Flink/Spark Streaming) → Target DB\n  ├── S3 sink connector → Data Lake (raw)\n  └── Elasticsearch sink → Search index\n\n\nKey decisions:\n\nTopic per table or single topic: Per table (easier routing, independent scaling)\nSchema registry: Always use one (Confluent Schema Registry or AWS Glue)\nSerialization: Avro (compact + schema evolution) or Protobuf (strict + fast)\nOffset management: Connector manages it; monitor consumer lag\nFeature Store Pattern\nfeature_store:\n  entity: \"customer\"\n  entity_key: \"customer_id\"\n  \n  features:\n    - name: \"total_orders_30d\"\n      description: \"Total orders in last 30 days\"\n      type: \"INT\"\n      source: \"fct_orders\"\n      computation: \"batch\"      # batch | streaming | on-demand\n      freshness: \"daily\"\n      ttl_hours: 48\n    \n    - name: \"avg_order_value_90d\"\n      description: \"Average order value last 90 days\"\n      type: \"FLOAT\"\n      source: \"fct_orders\"\n      computation: \"batch\"\n      freshness: \"daily\"\n      ttl_hours: 48\n    \n    - name: \"last_login_minutes_ago\"\n      description: \"Minutes since last login event\"\n      type: \"INT\"\n      source: \"events_stream\"\n      computation: \"streaming\"\n      freshness: \"real-time\"\n      ttl_hours: 1\n  \n  serving:\n    online: true               # low-latency feature serving (Redis/DynamoDB)\n    offline: true              # batch feature retrieval for training\n    point_in_time_correct: true  # prevent feature leakage in ML training\n\nData Mesh Principles\n\nIf operating at scale (>5 data teams):\n\nDomain ownership: Each business domain owns its data products (not a central data team)\nData as a product: Treat datasets like products — SLAs, documentation, discoverability\nSelf-serve platform: Central team builds the platform, domains build on top\nFederated governance: Standards and interoperability maintained centrally, implementation decentralized\n\nWhen NOT to use Data Mesh:\n\n<5 data producers/consumers\nSmall team (<20 engineers total)\nSingle business domain\nEarly-stage company (over-engineering risk)\nQuality Scoring Rubric (0-100)\nDimension\tWeight\tScoring\nPipeline Reliability\t20\t0=frequent failures, 10=some failures with manual recovery, 20=99.5%+ success rate with auto-retry\nData Quality\t20\t0=no checks, 10=basic null/unique checks, 20=comprehensive quality framework with contracts\nPerformance\t15\t0=regularly breaches SLA, 8=meets SLA, 15=well under SLA with optimization\nDocumentation\t10\t0=none, 5=basic README, 10=full catalog entries with lineage and business definitions\nMonitoring\t15\t0=no alerts, 8=failure alerts only, 15=proactive monitoring with dashboards and anomaly detection\nTesting\t10\t0=no tests, 5=basic smoke tests, 10=full test pyramid (unit+integration+contract+E2E)\nCost Efficiency\t10\t0=no cost tracking, 5=tracked, 10=optimized with ROI justification per pipeline\n\nScoring guide:\n\n0-40: Critical gaps — prioritize pipeline reliability and data quality\n41-60: Functional but fragile — add monitoring, testing, documentation\n61-80: Solid — optimize performance, cost, governance\n81-100: Excellent — maintain, innovate, mentor\nEdge Cases & Gotchas\nTimezone Traps\nStore everything in UTC; convert only at the presentation layer\nEvent timestamps: use event time, not processing time\nDaylight saving: TIMESTAMP WITH TIME ZONE, never WITHOUT\nLate-arriving data: watermark strategy + allowed lateness window\nLate-Arriving Data\nDefine maximum acceptable lateness per source\nReprocess affected partitions when late data arrives\nTrack the late arrival rate as a quality metric\nConsider a separate \"late data\" pipeline that patches records in\nExactly-Once Processing\nTrue exactly-once is expensive; most systems need at-least-once + idempotent writes\nUse transaction IDs or natural keys for deduplication\nKafka: use idempotent producer + transactional consumer\nDatabase: MERGE/UPSERT on natural key\nSchema Evolution\nBackward compatible: New code reads old data (safe to deploy new readers first)\nForward compatible: Old code reads new data (safe to deploy new writers first)\nFull compatible: Both directions (safest, most restrictive)\nUse Avro or Protobuf with a schema registry for streaming data\nMulti-Tenant Data\nTenant ID in every table, every query, every log\nRow-level security in the warehouse\nSeparate compute per tenant (or at least isolation)\nNever join across tenants without an explicit business reason\nTenant-aware backfill (don't rebuild all tenants for one tenant's issue)\nData Lake Anti-Patterns\n\"Data Swamp\": ingesting everything with no organization or catalog → only ingest what has a known consumer\nSmall files: thousands of <1MB files → compact regularly (target 100MB-1GB)\nNo table format: raw Parquet/CSV without Delta/Iceberg → loses ACID, schema evolution, time travel\nNo access controls: single bucket, everyone admin → implement IAM per domain/team\nNatural Language Commands\n\nSay any of these to activate specific workflows:\n\n\"Design a data pipeline for [source] to [target]\" → Full pipeline template with extraction strategy, transforms, load pattern, quality checks\n\"Model [entity/domain] for analytics\" → Dimensional model with fact/dimension tables, grain, measures, SCD types\n\"Optimize this query/pipeline\" → Performance analysis with specific recommendations\n\"Set up data quality for [table/pipeline]\" → Quality framework with checks, contracts, monitoring\n\"Audit our data infrastructure\" → Full assessment using the scoring rubric\n\"Help with [Spark/Airflow/dbt/Kafka] issue\" → Troubleshooting with technology-specific guidance\n\"Design a data catalog for our org\" → Catalog template with governance, classification, lineage\n\"Plan a data migration from [old] to [new]\" → Migration plan with validation, rollback, parallel-run\n\"Set up monitoring for our pipelines\" → Dashboard template with alerts, logging standards, runbooks\n\"Review our data costs\" → Cost analysis with optimization strategies and ROI framework\n\"Handle schema change in [source]\" → Change management protocol with impact assessment\n\"Backfill [table] for [date range]\" → Backfill protocol with validation and communication plan"
  },
  "trust": {
    "sourceLabel": "tencent",
    "provenanceUrl": "https://clawhub.ai/1kalin/afrexai-data-engineering",
    "publisherUrl": "https://clawhub.ai/1kalin/afrexai-data-engineering",
    "owner": "1kalin",
    "version": "1.0.0",
    "license": null,
    "verificationStatus": "Indexed source record"
  },
  "links": {
    "detailUrl": "https://openagent3.xyz/skills/afrexai-data-engineering",
    "downloadUrl": "https://openagent3.xyz/downloads/afrexai-data-engineering",
    "agentUrl": "https://openagent3.xyz/skills/afrexai-data-engineering/agent",
    "manifestUrl": "https://openagent3.xyz/skills/afrexai-data-engineering/agent.json",
    "briefUrl": "https://openagent3.xyz/skills/afrexai-data-engineering/agent.md"
  }
}