{
  "schemaVersion": "1.0",
  "item": {
    "slug": "senior-data-engineer",
    "name": "Senior Data Engineer",
    "source": "tencent",
    "type": "skill",
    "category": "AI 智能",
    "sourceUrl": "https://clawhub.ai/alirezarezvani/senior-data-engineer",
    "canonicalUrl": "https://clawhub.ai/alirezarezvani/senior-data-engineer",
    "targetPlatform": "OpenClaw"
  },
  "install": {
    "downloadMode": "redirect",
    "downloadUrl": "/downloads/senior-data-engineer",
    "sourceDownloadUrl": "https://wry-manatee-359.convex.site/api/v1/download?slug=senior-data-engineer",
    "sourcePlatform": "tencent",
    "targetPlatform": "OpenClaw",
    "installMethod": "Manual import",
    "extraction": "Extract archive",
    "prerequisites": [
      "OpenClaw"
    ],
    "packageFormat": "ZIP package",
    "includedAssets": [
      "SKILL.md",
      "references/data_modeling_patterns.md",
      "references/data_pipeline_architecture.md",
      "references/dataops_best_practices.md",
      "references/troubleshooting.md",
      "references/workflows.md"
    ],
    "primaryDoc": "SKILL.md",
    "quickSetup": [
      "Download the package from Yavira.",
      "Extract the archive and review SKILL.md first.",
      "Import or place the package into your OpenClaw setup."
    ],
    "agentAssist": {
      "summary": "Hand the extracted package to your coding agent with a concrete install brief instead of figuring it out manually.",
      "steps": [
        "Download the package from Yavira.",
        "Extract it into a folder your agent can access.",
        "Paste one of the prompts below and point your agent at the extracted folder."
      ],
      "prompts": [
        {
          "label": "New install",
          "body": "I downloaded a skill package from Yavira. Read SKILL.md from the extracted folder and install it by following the included instructions. Tell me what you changed and call out any manual steps you could not complete."
        },
        {
          "label": "Upgrade existing",
          "body": "I downloaded an updated skill package from Yavira. Read SKILL.md from the extracted folder, compare it with my current installation, and upgrade it while preserving any custom configuration unless the package docs explicitly say otherwise. Summarize what changed and any follow-up checks I should run."
        }
      ]
    },
    "sourceHealth": {
      "source": "tencent",
      "status": "healthy",
      "reason": "direct_download_ok",
      "recommendedAction": "download",
      "checkedAt": "2026-04-30T16:55:25.780Z",
      "expiresAt": "2026-05-07T16:55:25.780Z",
      "httpStatus": 200,
      "finalUrl": "https://wry-manatee-359.convex.site/api/v1/download?slug=senior-data-engineer",
      "contentType": "application/zip",
      "probeMethod": "head",
      "details": {
        "probeUrl": "https://wry-manatee-359.convex.site/api/v1/download?slug=senior-data-engineer",
        "contentDisposition": "attachment; filename=\"senior-data-engineer-2.1.1.zip\"",
        "redirectLocation": null,
        "bodySnippet": null
      },
      "scope": "source",
      "summary": "Source download looks usable.",
      "detail": "Yavira can redirect you to the upstream package for this source.",
      "primaryActionLabel": "Download for OpenClaw",
      "primaryActionHref": "/downloads/senior-data-engineer"
    },
    "validation": {
      "installChecklist": [
        "Use the Yavira download entry.",
        "Review SKILL.md after the package is downloaded.",
        "Confirm the extracted package contains the expected setup assets."
      ],
      "postInstallChecks": [
        "Confirm the extracted package includes the expected docs or setup files.",
        "Validate the skill or prompts are available in your target agent workspace.",
        "Capture any manual follow-up steps the agent could not complete."
      ]
    },
    "downloadPageUrl": "https://openagent3.xyz/downloads/senior-data-engineer",
    "agentPageUrl": "https://openagent3.xyz/skills/senior-data-engineer/agent",
    "manifestUrl": "https://openagent3.xyz/skills/senior-data-engineer/agent.json",
    "briefUrl": "https://openagent3.xyz/skills/senior-data-engineer/agent.md"
  },
  "agentAssist": {
    "summary": "Hand the extracted package to your coding agent with a concrete install brief instead of figuring it out manually.",
    "steps": [
      "Download the package from Yavira.",
      "Extract it into a folder your agent can access.",
      "Paste one of the prompts below and point your agent at the extracted folder."
    ],
    "prompts": [
      {
        "label": "New install",
        "body": "I downloaded a skill package from Yavira. Read SKILL.md from the extracted folder and install it by following the included instructions. Tell me what you changed and call out any manual steps you could not complete."
      },
      {
        "label": "Upgrade existing",
        "body": "I downloaded an updated skill package from Yavira. Read SKILL.md from the extracted folder, compare it with my current installation, and upgrade it while preserving any custom configuration unless the package docs explicitly say otherwise. Summarize what changed and any follow-up checks I should run."
      }
    ]
  },
  "documentation": {
    "source": "clawhub",
    "primaryDoc": "SKILL.md",
    "sections": [
      {
        "title": "Senior Data Engineer",
        "body": "Production-grade data engineering skill for building scalable, reliable data systems."
      },
      {
        "title": "Table of Contents",
        "body": "Trigger Phrases\nQuick Start\nWorkflows\n\nBuilding a Batch ETL Pipeline\nImplementing Real-Time Streaming\nData Quality Framework Setup\n\n\nArchitecture Decision Framework\nTech Stack\nReference Documentation\nTroubleshooting"
      },
      {
        "title": "Trigger Phrases",
        "body": "Activate this skill when you see:\n\nPipeline Design:\n\n\"Design a data pipeline for...\"\n\"Build an ETL/ELT process...\"\n\"How should I ingest data from...\"\n\"Set up data extraction from...\"\n\nArchitecture:\n\n\"Should I use batch or streaming?\"\n\"Lambda vs Kappa architecture\"\n\"How to handle late-arriving data\"\n\"Design a data lakehouse\"\n\nData Modeling:\n\n\"Create a dimensional model...\"\n\"Star schema vs snowflake\"\n\"Implement slowly changing dimensions\"\n\"Design a data vault\"\n\nData Quality:\n\n\"Add data validation to...\"\n\"Set up data quality checks\"\n\"Monitor data freshness\"\n\"Implement data contracts\"\n\nPerformance:\n\n\"Optimize this Spark job\"\n\"Query is running slow\"\n\"Reduce pipeline execution time\"\n\"Tune Airflow DAG\""
      },
      {
        "title": "Core Tools",
        "body": "# Generate pipeline orchestration config\npython scripts/pipeline_orchestrator.py generate \\\n  --type airflow \\\n  --source postgres \\\n  --destination snowflake \\\n  --schedule \"0 5 * * *\"\n\n# Validate data quality\npython scripts/data_quality_validator.py validate \\\n  --input data/sales.parquet \\\n  --schema schemas/sales.json \\\n  --checks freshness,completeness,uniqueness\n\n# Optimize ETL performance\npython scripts/etl_performance_optimizer.py analyze \\\n  --query queries/daily_aggregation.sql \\\n  --engine spark \\\n  --recommend"
      },
      {
        "title": "Workflows",
        "body": "→ See references/workflows.md for details"
      },
      {
        "title": "Architecture Decision Framework",
        "body": "Use this framework to choose the right approach for your data pipeline."
      },
      {
        "title": "Batch vs Streaming",
        "body": "CriteriaBatchStreamingLatency requirementHours to daysSeconds to minutesData volumeLarge historical datasetsContinuous event streamsProcessing complexityComplex transformations, MLSimple aggregations, filteringCost sensitivityMore cost-effectiveHigher infrastructure costError handlingEasier to reprocessRequires careful design\n\nDecision Tree:\n\nIs real-time insight required?\n├── Yes → Use streaming\n│   └── Is exactly-once semantics needed?\n│       ├── Yes → Kafka + Flink/Spark Structured Streaming\n│       └── No → Kafka + consumer groups\n└── No → Use batch\n    └── Is data volume > 1TB daily?\n        ├── Yes → Spark/Databricks\n        └── No → dbt + warehouse compute"
      },
      {
        "title": "Lambda vs Kappa Architecture",
        "body": "AspectLambdaKappaComplexityTwo codebases (batch + stream)Single codebaseMaintenanceHigher (sync batch/stream logic)LowerReprocessingNative batch layerReplay from sourceUse caseML training + real-time servingPure event-driven\n\nWhen to choose Lambda:\n\nNeed to train ML models on historical data\nComplex batch transformations not feasible in streaming\nExisting batch infrastructure\n\nWhen to choose Kappa:\n\nEvent-sourced architecture\nAll processing can be expressed as stream operations\nStarting fresh without legacy systems"
      },
      {
        "title": "Data Warehouse vs Data Lakehouse",
        "body": "FeatureWarehouse (Snowflake/BigQuery)Lakehouse (Delta/Iceberg)Best forBI, SQL analyticsML, unstructured dataStorage costHigher (proprietary format)Lower (open formats)FlexibilitySchema-on-writeSchema-on-readPerformanceExcellent for SQLGood, improvingEcosystemMature BI toolsGrowing ML tooling"
      },
      {
        "title": "Tech Stack",
        "body": "CategoryTechnologiesLanguagesPython, SQL, ScalaOrchestrationAirflow, Prefect, DagsterTransformationdbt, Spark, FlinkStreamingKafka, Kinesis, Pub/SubStorageS3, GCS, Delta Lake, IcebergWarehousesSnowflake, BigQuery, Redshift, DatabricksQualityGreat Expectations, dbt tests, Monte CarloMonitoringPrometheus, Grafana, Datadog"
      },
      {
        "title": "1. Data Pipeline Architecture",
        "body": "See references/data_pipeline_architecture.md for:\n\nLambda vs Kappa architecture patterns\nBatch processing with Spark and Airflow\nStream processing with Kafka and Flink\nExactly-once semantics implementation\nError handling and dead letter queues"
      },
      {
        "title": "2. Data Modeling Patterns",
        "body": "See references/data_modeling_patterns.md for:\n\nDimensional modeling (Star/Snowflake)\nSlowly Changing Dimensions (SCD Types 1-6)\nData Vault modeling\ndbt best practices\nPartitioning and clustering"
      },
      {
        "title": "3. DataOps Best Practices",
        "body": "See references/dataops_best_practices.md for:\n\nData testing frameworks\nData contracts and schema validation\nCI/CD for data pipelines\nObservability and lineage\nIncident response"
      },
      {
        "title": "Troubleshooting",
        "body": "→ See references/troubleshooting.md for details"
      }
    ],
    "body": "Senior Data Engineer\n\nProduction-grade data engineering skill for building scalable, reliable data systems.\n\nTable of Contents\nTrigger Phrases\nQuick Start\nWorkflows\nBuilding a Batch ETL Pipeline\nImplementing Real-Time Streaming\nData Quality Framework Setup\nArchitecture Decision Framework\nTech Stack\nReference Documentation\nTroubleshooting\nTrigger Phrases\n\nActivate this skill when you see:\n\nPipeline Design:\n\n\"Design a data pipeline for...\"\n\"Build an ETL/ELT process...\"\n\"How should I ingest data from...\"\n\"Set up data extraction from...\"\n\nArchitecture:\n\n\"Should I use batch or streaming?\"\n\"Lambda vs Kappa architecture\"\n\"How to handle late-arriving data\"\n\"Design a data lakehouse\"\n\nData Modeling:\n\n\"Create a dimensional model...\"\n\"Star schema vs snowflake\"\n\"Implement slowly changing dimensions\"\n\"Design a data vault\"\n\nData Quality:\n\n\"Add data validation to...\"\n\"Set up data quality checks\"\n\"Monitor data freshness\"\n\"Implement data contracts\"\n\nPerformance:\n\n\"Optimize this Spark job\"\n\"Query is running slow\"\n\"Reduce pipeline execution time\"\n\"Tune Airflow DAG\"\nQuick Start\nCore Tools\n# Generate pipeline orchestration config\npython scripts/pipeline_orchestrator.py generate \\\n  --type airflow \\\n  --source postgres \\\n  --destination snowflake \\\n  --schedule \"0 5 * * *\"\n\n# Validate data quality\npython scripts/data_quality_validator.py validate \\\n  --input data/sales.parquet \\\n  --schema schemas/sales.json \\\n  --checks freshness,completeness,uniqueness\n\n# Optimize ETL performance\npython scripts/etl_performance_optimizer.py analyze \\\n  --query queries/daily_aggregation.sql \\\n  --engine spark \\\n  --recommend\n\nWorkflows\n\n→ See references/workflows.md for details\n\nArchitecture Decision Framework\n\nUse this framework to choose the right approach for your data pipeline.\n\nBatch vs Streaming\nCriteria\tBatch\tStreaming\nLatency requirement\tHours to 
days\tSeconds to minutes\nData volume\tLarge historical datasets\tContinuous event streams\nProcessing complexity\tComplex transformations, ML\tSimple aggregations, filtering\nCost sensitivity\tMore cost-effective\tHigher infrastructure cost\nError handling\tEasier to reprocess\tRequires careful design\n\nDecision Tree:\n\nIs real-time insight required?\n├── Yes → Use streaming\n│   └── Is exactly-once semantics needed?\n│       ├── Yes → Kafka + Flink/Spark Structured Streaming\n│       └── No → Kafka + consumer groups\n└── No → Use batch\n    └── Is data volume > 1TB daily?\n        ├── Yes → Spark/Databricks\n        └── No → dbt + warehouse compute\n\nLambda vs Kappa Architecture\nAspect\tLambda\tKappa\nComplexity\tTwo codebases (batch + stream)\tSingle codebase\nMaintenance\tHigher (sync batch/stream logic)\tLower\nReprocessing\tNative batch layer\tReplay from source\nUse case\tML training + real-time serving\tPure event-driven\n\nWhen to choose Lambda:\n\nNeed to train ML models on historical data\nComplex batch transformations not feasible in streaming\nExisting batch infrastructure\n\nWhen to choose Kappa:\n\nEvent-sourced architecture\nAll processing can be expressed as stream operations\nStarting fresh without legacy systems\nData Warehouse vs Data Lakehouse\nFeature\tWarehouse (Snowflake/BigQuery)\tLakehouse (Delta/Iceberg)\nBest for\tBI, SQL analytics\tML, unstructured data\nStorage cost\tHigher (proprietary format)\tLower (open formats)\nFlexibility\tSchema-on-write\tSchema-on-read\nPerformance\tExcellent for SQL\tGood, improving\nEcosystem\tMature BI tools\tGrowing ML tooling\nTech Stack\nCategory\tTechnologies\nLanguages\tPython, SQL, Scala\nOrchestration\tAirflow, Prefect, Dagster\nTransformation\tdbt, Spark, Flink\nStreaming\tKafka, Kinesis, Pub/Sub\nStorage\tS3, GCS, Delta Lake, Iceberg\nWarehouses\tSnowflake, BigQuery, Redshift, Databricks\nQuality\tGreat Expectations, dbt tests, Monte Carlo\nMonitoring\tPrometheus, Grafana, Datadog\nReference 
Documentation\n1. Data Pipeline Architecture\n\nSee references/data_pipeline_architecture.md for:\n\nLambda vs Kappa architecture patterns\nBatch processing with Spark and Airflow\nStream processing with Kafka and Flink\nExactly-once semantics implementation\nError handling and dead letter queues\n2. Data Modeling Patterns\n\nSee references/data_modeling_patterns.md for:\n\nDimensional modeling (Star/Snowflake)\nSlowly Changing Dimensions (SCD Types 1-6)\nData Vault modeling\ndbt best practices\nPartitioning and clustering\n3. DataOps Best Practices\n\nSee references/dataops_best_practices.md for:\n\nData testing frameworks\nData contracts and schema validation\nCI/CD for data pipelines\nObservability and lineage\nIncident response\nTroubleshooting\n\n→ See references/troubleshooting.md for details"
  },
  "trust": {
    "sourceLabel": "tencent",
    "provenanceUrl": "https://clawhub.ai/alirezarezvani/senior-data-engineer",
    "publisherUrl": "https://clawhub.ai/alirezarezvani/senior-data-engineer",
    "owner": "alirezarezvani",
    "version": "2.1.1",
    "license": null,
    "verificationStatus": "Indexed source record"
  },
  "links": {
    "detailUrl": "https://openagent3.xyz/skills/senior-data-engineer",
    "downloadUrl": "https://openagent3.xyz/downloads/senior-data-engineer",
    "agentUrl": "https://openagent3.xyz/skills/senior-data-engineer/agent",
    "manifestUrl": "https://openagent3.xyz/skills/senior-data-engineer/agent.json",
    "briefUrl": "https://openagent3.xyz/skills/senior-data-engineer/agent.md"
  }
}