{
  "schemaVersion": "1.0",
  "item": {
    "slug": "data-analyst",
    "name": "Data Analyst",
    "source": "tencent",
    "type": "skill",
    "category": "数据分析",
    "sourceUrl": "https://clawhub.ai/oyi77/data-analyst",
    "canonicalUrl": "https://clawhub.ai/oyi77/data-analyst",
    "targetPlatform": "OpenClaw"
  },
  "install": {
    "downloadMode": "manual_only",
    "downloadUrl": "/downloads/data-analyst",
    "sourceDownloadUrl": "https://wry-manatee-359.convex.site/api/v1/download?slug=data-analyst",
    "sourcePlatform": "tencent",
    "targetPlatform": "OpenClaw",
    "installMethod": "Manual import",
    "extraction": "Extract archive",
    "prerequisites": [
      "OpenClaw"
    ],
    "packageFormat": "ZIP package",
    "includedAssets": [
      "SKILL.md",
      "scripts/data-init.sh",
      "scripts/query.sh"
    ],
    "primaryDoc": "SKILL.md",
    "quickSetup": [
      "Open the source page and confirm the package flow manually.",
      "Review SKILL.md if you can obtain the files.",
      "Treat this source as manual setup until the download is verified."
    ],
    "agentAssist": {
      "summary": "Use the source page and any available docs to guide the install because the item currently does not return a direct package file.",
      "steps": [
        "Open the source page via Open source listing.",
        "If you can obtain the package, extract it into a folder your agent can access.",
        "Paste one of the prompts below and point your agent at the source page and extracted files."
      ],
      "prompts": [
        {
          "label": "New install",
          "body": "I tried to install a skill package from Yavira, but the item currently does not return a direct package file. Inspect the source page and any extracted docs, then tell me what you can confirm and any manual steps still required."
        },
        {
          "label": "Upgrade existing",
          "body": "I tried to upgrade a skill package from Yavira, but the item currently does not return a direct package file. Compare the source page and any extracted docs with my current installation, then summarize what changed and what manual follow-up I still need."
        }
      ]
    },
    "sourceHealth": {
      "source": "tencent",
      "slug": "data-analyst",
      "status": "source_issue",
      "reason": "not_found",
      "recommendedAction": "review_source",
      "checkedAt": "2026-04-29T07:25:48.984Z",
      "expiresAt": "2026-04-30T07:25:48.984Z",
      "httpStatus": 404,
      "finalUrl": "https://wry-manatee-359.convex.site/api/v1/download?slug=data-analyst",
      "contentType": "text/plain",
      "probeMethod": "head",
      "details": {
        "probeUrl": "https://wry-manatee-359.convex.site/api/v1/download?slug=data-analyst",
        "contentDisposition": null,
        "redirectLocation": null,
        "bodySnippet": null,
        "slug": "data-analyst"
      },
      "scope": "item",
      "summary": "Known item issue.",
      "detail": "This item's current download entry is known to bounce back to a listing or homepage instead of returning a package file.",
      "primaryActionLabel": "Open source listing",
      "primaryActionHref": "https://clawhub.ai/oyi77/data-analyst"
    },
    "validation": {
      "installChecklist": [
        "Open the source listing and confirm there is a real package or setup artifact available.",
        "Review SKILL.md before asking your agent to continue.",
        "Treat this source as manual setup until the upstream download flow is fixed."
      ],
      "postInstallChecks": [
        "Confirm the extracted package includes the expected docs or setup files.",
        "Validate the skill or prompts are available in your target agent workspace.",
        "Capture any manual follow-up steps the agent could not complete."
      ]
    },
    "downloadPageUrl": "https://openagent3.xyz/downloads/data-analyst",
    "agentPageUrl": "https://openagent3.xyz/skills/data-analyst/agent",
    "manifestUrl": "https://openagent3.xyz/skills/data-analyst/agent.json",
    "briefUrl": "https://openagent3.xyz/skills/data-analyst/agent.md"
  },
  "agentAssist": {
    "summary": "Use the source page and any available docs to guide the install because the item currently does not return a direct package file.",
    "steps": [
      "Open the source page via Open source listing.",
      "If you can obtain the package, extract it into a folder your agent can access.",
      "Paste one of the prompts below and point your agent at the source page and extracted files."
    ],
    "prompts": [
      {
        "label": "New install",
        "body": "I tried to install a skill package from Yavira, but the item currently does not return a direct package file. Inspect the source page and any extracted docs, then tell me what you can confirm and any manual steps still required."
      },
      {
        "label": "Upgrade existing",
        "body": "I tried to upgrade a skill package from Yavira, but the item currently does not return a direct package file. Compare the source page and any extracted docs with my current installation, then summarize what changed and what manual follow-up I still need."
      }
    ]
  },
  "documentation": {
    "source": "clawhub",
    "primaryDoc": "SKILL.md",
    "sections": [
      {
        "title": "Data Analyst Skill 📊",
        "body": "Turn your AI agent into a data analysis powerhouse.\n\nQuery databases, analyze spreadsheets, create visualizations, and generate insights that drive decisions."
      },
      {
        "title": "What This Skill Does",
        "body": "✅ SQL Queries — Write and execute queries against databases\n✅ Spreadsheet Analysis — Process CSV, Excel, Google Sheets data\n✅ Data Visualization — Create charts, graphs, and dashboards\n✅ Report Generation — Automated reports with insights\n✅ Data Cleaning — Handle missing data, outliers, formatting\n✅ Statistical Analysis — Descriptive stats, trends, correlations"
      },
      {
        "title": "Quick Start",
        "body": "Configure your data sources in TOOLS.md:\n\n### Data Sources\n- Primary DB: [Connection string or description]\n- Spreadsheets: [Google Sheets URL / local path]\n- Data warehouse: [BigQuery/Snowflake/etc.]\n\nSet up your workspace:\n\n./scripts/data-init.sh\n\nStart analyzing!"
      },
      {
        "title": "Common Query Templates",
        "body": "Basic Data Exploration\n\n-- Row count\nSELECT COUNT(*) FROM table_name;\n\n-- Sample data\nSELECT * FROM table_name LIMIT 10;\n\n-- Column statistics\nSELECT \n    column_name,\n    COUNT(*) as count,\n    COUNT(DISTINCT column_name) as unique_values,\n    MIN(column_name) as min_val,\n    MAX(column_name) as max_val\nFROM table_name\nGROUP BY column_name;\n\nTime-Based Analysis\n\n-- Daily aggregation\nSELECT \n    DATE(created_at) as date,\n    COUNT(*) as daily_count,\n    SUM(amount) as daily_total\nFROM transactions\nGROUP BY DATE(created_at)\nORDER BY date DESC;\n\n-- Month-over-month comparison\nSELECT \n    DATE_TRUNC('month', created_at) as month,\n    COUNT(*) as count,\n    LAG(COUNT(*)) OVER (ORDER BY DATE_TRUNC('month', created_at)) as prev_month,\n    (COUNT(*) - LAG(COUNT(*)) OVER (ORDER BY DATE_TRUNC('month', created_at))) / \n        NULLIF(LAG(COUNT(*)) OVER (ORDER BY DATE_TRUNC('month', created_at)), 0) * 100 as growth_pct\nFROM transactions\nGROUP BY DATE_TRUNC('month', created_at)\nORDER BY month;\n\nCohort Analysis\n\n-- User cohort by signup month\nSELECT \n    DATE_TRUNC('month', u.created_at) as cohort_month,\n    DATE_TRUNC('month', o.created_at) as activity_month,\n    COUNT(DISTINCT u.id) as users\nFROM users u\nLEFT JOIN orders o ON u.id = o.user_id\nGROUP BY cohort_month, activity_month\nORDER BY cohort_month, activity_month;\n\nFunnel Analysis\n\n-- Conversion funnel\nWITH funnel AS (\n    SELECT\n        COUNT(DISTINCT CASE WHEN event = 'page_view' THEN user_id END) as views,\n        COUNT(DISTINCT CASE WHEN event = 'signup' THEN user_id END) as signups,\n        COUNT(DISTINCT CASE WHEN event = 'purchase' THEN user_id END) as purchases\n    FROM events\n    WHERE date >= CURRENT_DATE - INTERVAL '30 days'\n)\nSELECT \n    views,\n    signups,\n    ROUND(signups * 100.0 / NULLIF(views, 0), 2) as signup_rate,\n    purchases,\n    ROUND(purchases * 100.0 / NULLIF(signups, 0), 2) as purchase_rate\nFROM funnel;"
      },
      {
        "title": "Common Data Quality Issues",
        "body": "IssueDetectionSolutionMissing valuesIS NULL or empty stringImpute, drop, or flagDuplicatesGROUP BY with HAVING COUNT(*) > 1Deduplicate with rulesOutliersZ-score > 3 or IQR methodInvestigate, cap, or excludeInconsistent formatsSample and pattern matchStandardize with transformsInvalid valuesRange checks, referential integrityValidate and correct"
      },
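      {
        "title": "Outlier Detection with Python",
        "body": "A minimal pandas sketch of the two detection methods named in the table above (Z-score > 3 and the IQR fence). The file name and the 'value' column are placeholders, not assets shipped with this skill.\n\nimport pandas as pd\n\ndf = pd.read_csv('data.csv')  # placeholder dataset with a numeric 'value' column\n\n# Z-score method: flag rows more than 3 standard deviations from the mean\nz = (df['value'] - df['value'].mean()) / df['value'].std()\nz_outliers = df[z.abs() > 3]\n\n# IQR method: flag rows outside 1.5 * IQR of the quartile fences\nq1, q3 = df['value'].quantile([0.25, 0.75])\niqr = q3 - q1\niqr_outliers = df[(df['value'] < q1 - 1.5 * iqr) | (df['value'] > q3 + 1.5 * iqr)]"
      },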
      {
        "title": "Data Cleaning SQL Patterns",
        "body": "-- Find duplicates\nSELECT email, COUNT(*)\nFROM users\nGROUP BY email\nHAVING COUNT(*) > 1;\n\n-- Find nulls\nSELECT \n    COUNT(*) as total,\n    SUM(CASE WHEN email IS NULL THEN 1 ELSE 0 END) as null_emails,\n    SUM(CASE WHEN name IS NULL THEN 1 ELSE 0 END) as null_names\nFROM users;\n\n-- Standardize text\nUPDATE products\nSET category = LOWER(TRIM(category));\n\n-- Remove outliers (IQR method)\nWITH stats AS (\n    SELECT \n        PERCENTILE_CONT(0.25) WITHIN GROUP (ORDER BY value) as q1,\n        PERCENTILE_CONT(0.75) WITHIN GROUP (ORDER BY value) as q3\n    FROM data\n)\nSELECT * FROM data, stats\nWHERE value BETWEEN q1 - 1.5*(q3-q1) AND q3 + 1.5*(q3-q1);"
      },
      {
        "title": "Data Cleaning Checklist",
        "body": "# Data Quality Audit: [Dataset]\n\n## Row-Level Checks\n- [ ] Total row count: [X]\n- [ ] Duplicate rows: [X]\n- [ ] Rows with any null: [X]\n\n## Column-Level Checks\n| Column | Type | Nulls | Unique | Min | Max | Issues |\n|--------|------|-------|--------|-----|-----|--------|\n| [col] | [type] | [n] | [n] | [v] | [v] | [notes] |\n\n## Data Lineage\n- Source: [Where data came from]\n- Last updated: [Date]\n- Known issues: [List]\n\n## Cleaning Actions Taken\n1. [Action and reason]\n2. [Action and reason]"
      },
      {
        "title": "CSV/Excel Processing with Python",
        "body": "import pandas as pd\n\n# Load data\ndf = pd.read_csv('data.csv')  # or pd.read_excel('data.xlsx')\n\n# Basic exploration\nprint(df.shape)  # (rows, columns)\nprint(df.info())  # Column types and nulls\nprint(df.describe())  # Numeric statistics\n\n# Data cleaning\ndf = df.drop_duplicates()\ndf['date'] = pd.to_datetime(df['date'])\ndf['amount'] = df['amount'].fillna(0)\n\n# Analysis\nsummary = df.groupby('category').agg({\n    'amount': ['sum', 'mean', 'count'],\n    'quantity': 'sum'\n}).round(2)\n\n# Export\nsummary.to_csv('analysis_output.csv')"
      },
      {
        "title": "Common Pandas Operations",
        "body": "# Filtering\nfiltered = df[df['status'] == 'active']\nfiltered = df[df['amount'] > 1000]\nfiltered = df[df['date'].between('2024-01-01', '2024-12-31')]\n\n# Aggregation\nby_category = df.groupby('category')['amount'].sum()\npivot = df.pivot_table(values='amount', index='month', columns='category', aggfunc='sum')\n\n# Window functions\ndf['running_total'] = df['amount'].cumsum()\ndf['pct_change'] = df['amount'].pct_change()\ndf['rolling_avg'] = df['amount'].rolling(window=7).mean()\n\n# Merging\nmerged = pd.merge(df1, df2, on='id', how='left')"
      },
      {
        "title": "Chart Selection Guide",
        "body": "Data TypeBest ChartUse WhenTrend over timeLine chartShowing patterns/changes over timeCategory comparisonBar chartComparing discrete categoriesPart of wholePie/DonutShowing proportions (≤5 categories)DistributionHistogramUnderstanding data spreadCorrelationScatter plotRelationship between two variablesMany categoriesHorizontal barRanking or comparing many itemsGeographicMapLocation-based data"
      },
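      {
        "title": "Histogram and Scatter Examples",
        "body": "The Matplotlib section below covers line charts, bar charts, and heatmaps; here is a minimal sketch for the guide's distribution and correlation rows. The column names are illustrative placeholders.\n\nimport matplotlib.pyplot as plt\nimport pandas as pd\n\ndf = pd.read_csv('data.csv')  # placeholder; 'amount', 'sales', 'marketing_spend' are example columns\n\n# Histogram (distribution)\nplt.figure(figsize=(10, 6))\nplt.hist(df['amount'], bins=30, edgecolor='black')\nplt.title('Amount Distribution')\nplt.tight_layout()\nplt.savefig('distribution.png', dpi=150)\n\n# Scatter plot (correlation between two variables)\nplt.figure(figsize=(10, 6))\nplt.scatter(df['marketing_spend'], df['sales'], alpha=0.6)\nplt.xlabel('Marketing Spend')\nplt.ylabel('Sales')\nplt.tight_layout()\nplt.savefig('scatter.png', dpi=150)"
      },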
      {
        "title": "Python Visualization with Matplotlib/Seaborn",
        "body": "import matplotlib.pyplot as plt\nimport seaborn as sns\n\n# Set style\nplt.style.use('seaborn-v0_8-whitegrid')\nsns.set_palette(\"husl\")\n\n# Line chart (trends)\nplt.figure(figsize=(10, 6))\nplt.plot(df['date'], df['value'], marker='o')\nplt.title('Trend Over Time')\nplt.xlabel('Date')\nplt.ylabel('Value')\nplt.xticks(rotation=45)\nplt.tight_layout()\nplt.savefig('trend.png', dpi=150)\n\n# Bar chart (comparisons)\nplt.figure(figsize=(10, 6))\nsns.barplot(data=df, x='category', y='amount')\nplt.title('Amount by Category')\nplt.xticks(rotation=45)\nplt.tight_layout()\nplt.savefig('comparison.png', dpi=150)\n\n# Heatmap (correlations)\nplt.figure(figsize=(10, 8))\nsns.heatmap(df.corr(), annot=True, cmap='coolwarm', center=0)\nplt.title('Correlation Matrix')\nplt.tight_layout()\nplt.savefig('correlation.png', dpi=150)"
      },
      {
        "title": "ASCII Charts (Quick Terminal Visualization)",
        "body": "When you can't generate images, use ASCII:\n\nRevenue by Month (in $K)\n========================\nJan: ████████████████ 160\nFeb: ██████████████████ 180\nMar: ████████████████████████ 240\nApr: ██████████████████████ 220\nMay: ██████████████████████████ 260\nJun: ████████████████████████████ 280"
      },
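      {
        "title": "ASCII Chart Script",
        "body": "A short Python sketch that renders bars like the example above; the revenue figures are illustrative, not real data.\n\n# Render a quick ASCII bar chart; scale controls blocks per unit\nrevenue = {'Jan': 160, 'Feb': 180, 'Mar': 240, 'Apr': 220, 'May': 260, 'Jun': 280}\nscale = 0.1  # one block per $10K\n\nprint('Revenue by Month (in $K)')\nprint('=' * 24)\nfor month, value in revenue.items():\n    print(f\"{month}: {'█' * round(value * scale)} {value}\")"
      },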
      {
        "title": "Standard Report Template",
        "body": "# [Report Name]\n**Period:** [Date range]\n**Generated:** [Date]\n**Author:** [Agent/Human]\n\n## Executive Summary\n[2-3 sentences with key findings]\n\n## Key Metrics\n\n| Metric | Current | Previous | Change |\n|--------|---------|----------|--------|\n| [Metric] | [Value] | [Value] | [+/-X%] |\n\n## Detailed Analysis\n\n### [Section 1]\n[Analysis with supporting data]\n\n### [Section 2]\n[Analysis with supporting data]\n\n## Visualizations\n[Insert charts]\n\n## Insights\n1. **[Insight]**: [Supporting evidence]\n2. **[Insight]**: [Supporting evidence]\n\n## Recommendations\n1. [Actionable recommendation]\n2. [Actionable recommendation]\n\n## Methodology\n- Data source: [Source]\n- Date range: [Range]\n- Filters applied: [Filters]\n- Known limitations: [Limitations]\n\n## Appendix\n[Supporting data tables]"
      },
      {
        "title": "Automated Report Script",
        "body": "#!/bin/bash\n# generate-report.sh\n\n# Pull latest data\npython scripts/extract_data.py --output data/latest.csv\n\n# Run analysis\npython scripts/analyze.py --input data/latest.csv --output reports/\n\n# Generate report\npython scripts/format_report.py --template weekly --output reports/weekly-$(date +%Y-%m-%d).md\n\necho \"Report generated: reports/weekly-$(date +%Y-%m-%d).md\""
      },
      {
        "title": "Descriptive Statistics",
        "body": "StatisticWhat It Tells YouUse CaseMeanAverage valueCentral tendencyMedianMiddle valueRobust to outliersModeMost commonCategorical dataStd DevSpread around meanVariabilityMin/MaxRangeData boundariesPercentilesDistribution shapeBenchmarking"
      },
      {
        "title": "Quick Stats with Python",
        "body": "# Full descriptive statistics\nstats = df['amount'].describe()\nprint(stats)\n\n# Additional stats\nprint(f\"Median: {df['amount'].median()}\")\nprint(f\"Mode: {df['amount'].mode()[0]}\")\nprint(f\"Skewness: {df['amount'].skew()}\")\nprint(f\"Kurtosis: {df['amount'].kurtosis()}\")\n\n# Correlation\ncorrelation = df['sales'].corr(df['marketing_spend'])\nprint(f\"Correlation: {correlation:.3f}\")"
      },
      {
        "title": "Statistical Tests Quick Reference",
        "body": "TestUse CasePythonT-testCompare two meansscipy.stats.ttest_ind(a, b)Chi-squareCategorical independencescipy.stats.chi2_contingency(table)ANOVACompare 3+ meansscipy.stats.f_oneway(a, b, c)PearsonLinear correlationscipy.stats.pearsonr(x, y)"
      },
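      {
        "title": "Running Statistical Tests with SciPy",
        "body": "A minimal sketch of two tests from the reference table above; the CSV and the 'group', 'amount', 'sales', and 'marketing_spend' columns are placeholders.\n\nfrom scipy import stats\nimport pandas as pd\n\ndf = pd.read_csv('data.csv')  # placeholder dataset\n\n# T-test: do two groups have different mean amounts?\na = df[df['group'] == 'A']['amount']\nb = df[df['group'] == 'B']['amount']\nt_stat, p_value = stats.ttest_ind(a, b)\nprint(f't-test: t={t_stat:.3f}, p={p_value:.4f}')\n\n# Pearson: linear correlation between two numeric columns\nr, p = stats.pearsonr(df['sales'], df['marketing_spend'])\nprint(f'Pearson: r={r:.3f}, p={p:.4f}')"
      },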
      {
        "title": "Standard Analysis Process",
        "body": "Define the Question\n\nWhat are we trying to answer?\nWhat decisions will this inform?\n\n\n\nUnderstand the Data\n\nWhat data is available?\nWhat's the structure and quality?\n\n\n\nClean and Prepare\n\nHandle missing values\nFix data types\nRemove duplicates\n\n\n\nExplore\n\nDescriptive statistics\nInitial visualizations\nIdentify patterns\n\n\n\nAnalyze\n\nDeep dive into findings\nStatistical tests if needed\nValidate hypotheses\n\n\n\nCommunicate\n\nClear visualizations\nActionable insights\nRecommendations"
      },
      {
        "title": "Analysis Request Template",
        "body": "# Analysis Request\n\n## Question\n[What are we trying to answer?]\n\n## Context\n[Why does this matter? What decision will it inform?]\n\n## Data Available\n- [Dataset 1]: [Description]\n- [Dataset 2]: [Description]\n\n## Expected Output\n- [Deliverable 1]\n- [Deliverable 2]\n\n## Timeline\n[When is this needed?]\n\n## Notes\n[Any constraints or considerations]"
      },
      {
        "title": "data-init.sh",
        "body": "Initialize your data analysis workspace."
      },
      {
        "title": "query.sh",
        "body": "Quick SQL query execution.\n\n# Run query from file\n./scripts/query.sh --file queries/daily-report.sql\n\n# Run inline query\n./scripts/query.sh \"SELECT COUNT(*) FROM users\"\n\n# Save output to file\n./scripts/query.sh --file queries/export.sql --output data/export.csv"
      },
      {
        "title": "analyze.py",
        "body": "Python analysis toolkit.\n\n# Basic analysis\npython scripts/analyze.py --input data/sales.csv\n\n# With specific analysis type\npython scripts/analyze.py --input data/sales.csv --type cohort\n\n# Generate report\npython scripts/analyze.py --input data/sales.csv --report weekly"
      },
      {
        "title": "With Other Skills",
        "body": "SkillIntegrationMarketingAnalyze campaign performance, content metricsSalesPipeline analytics, conversion analysisBusiness DevMarket research data, competitor analysis"
      },
      {
        "title": "Common Data Sources",
        "body": "Databases: PostgreSQL, MySQL, SQLite\nWarehouses: BigQuery, Snowflake, Redshift\nSpreadsheets: Google Sheets, Excel, CSV\nAPIs: REST endpoints, GraphQL\nFiles: JSON, Parquet, XML"
      },
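      {
        "title": "Loading Data Sources with Pandas",
        "body": "A hedged sketch of loading a few of these source types with pandas; the connection string and file paths are placeholders, not part of this package.\n\nimport pandas as pd\nfrom sqlalchemy import create_engine\n\n# Database (PostgreSQL via SQLAlchemy); the DSN is a placeholder\nengine = create_engine('postgresql://user:pass@localhost:5432/mydb')\norders = pd.read_sql('SELECT * FROM orders', engine)\n\n# Files\nevents = pd.read_parquet('data/events.parquet')\nrecords = pd.read_json('data/records.json')\n\n# Spreadsheets / CSV\nsales = pd.read_excel('data/sales.xlsx')\nleads = pd.read_csv('data/leads.csv')"
      },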
      {
        "title": "Best Practices",
        "body": "Start with the question — Know what you're trying to answer\nValidate your data — Garbage in = garbage out\nDocument everything — Queries, assumptions, decisions\nVisualize appropriately — Right chart for right data\nShow your work — Methodology matters\nLead with insights — Not just data dumps\nMake it actionable — \"So what?\" → \"Now what?\"\nVersion your queries — Track changes over time"
      },
      {
        "title": "Common Mistakes",
        "body": "❌ Confirmation bias — Looking for data to support a conclusion\n❌ Correlation ≠ causation — Be careful with claims\n❌ Cherry-picking — Using only favorable data\n❌ Ignoring outliers — Investigate before removing\n❌ Over-complicating — Simple analysis often wins\n❌ No context — Numbers without comparison are meaningless"
      },
      {
        "title": "License",
        "body": "License: MIT — use freely, modify, distribute.\n\n\"The goal is to turn data into information, and information into insight.\" — Carly Fiorina"
      }
    ]
  },
  "trust": {
    "sourceLabel": "tencent",
    "provenanceUrl": "https://clawhub.ai/oyi77/data-analyst",
    "publisherUrl": "https://clawhub.ai/oyi77/data-analyst",
    "owner": "oyi77",
    "version": "1.0.0",
    "license": null,
    "verificationStatus": "Indexed source record"
  },
  "links": {
    "detailUrl": "https://openagent3.xyz/skills/data-analyst",
    "downloadUrl": "https://openagent3.xyz/downloads/data-analyst",
    "agentUrl": "https://openagent3.xyz/skills/data-analyst/agent",
    "manifestUrl": "https://openagent3.xyz/skills/data-analyst/agent.json",
    "briefUrl": "https://openagent3.xyz/skills/data-analyst/agent.md"
  }
}