{
  "schemaVersion": "1.0",
  "item": {
    "slug": "data-anomaly-detector",
    "name": "Data Anomaly Detector",
    "source": "tencent",
    "type": "skill",
    "category": "数据分析",
    "sourceUrl": "https://clawhub.ai/datadrivenconstruction/data-anomaly-detector",
    "canonicalUrl": "https://clawhub.ai/datadrivenconstruction/data-anomaly-detector",
    "targetPlatform": "OpenClaw"
  },
  "install": {
    "downloadMode": "redirect",
    "downloadUrl": "/downloads/data-anomaly-detector",
    "sourceDownloadUrl": "https://wry-manatee-359.convex.site/api/v1/download?slug=data-anomaly-detector",
    "sourcePlatform": "tencent",
    "targetPlatform": "OpenClaw",
    "installMethod": "Manual import",
    "extraction": "Extract archive",
    "prerequisites": [
      "OpenClaw"
    ],
    "packageFormat": "ZIP package",
    "includedAssets": [
      "claw.json",
      "instructions.md",
      "SKILL.md"
    ],
    "primaryDoc": "SKILL.md",
    "quickSetup": [
      "Download the package from Yavira.",
      "Extract the archive and review SKILL.md first.",
      "Import or place the package into your OpenClaw setup."
    ],
    "agentAssist": {
      "summary": "Hand the extracted package to your coding agent with a concrete install brief instead of figuring it out manually.",
      "steps": [
        "Download the package from Yavira.",
        "Extract it into a folder your agent can access.",
        "Paste one of the prompts below and point your agent at the extracted folder."
      ],
      "prompts": [
        {
          "label": "New install",
          "body": "I downloaded a skill package from Yavira. Read SKILL.md from the extracted folder and install it by following the included instructions. Tell me what you changed and call out any manual steps you could not complete."
        },
        {
          "label": "Upgrade existing",
          "body": "I downloaded an updated skill package from Yavira. Read SKILL.md from the extracted folder, compare it with my current installation, and upgrade it while preserving any custom configuration unless the package docs explicitly say otherwise. Summarize what changed and any follow-up checks I should run."
        }
      ]
    },
    "sourceHealth": {
      "source": "tencent",
      "status": "healthy",
      "reason": "direct_download_ok",
      "recommendedAction": "download",
      "checkedAt": "2026-04-23T16:43:11.935Z",
      "expiresAt": "2026-04-30T16:43:11.935Z",
      "httpStatus": 200,
      "finalUrl": "https://wry-manatee-359.convex.site/api/v1/download?slug=4claw-imageboard",
      "contentType": "application/zip",
      "probeMethod": "head",
      "details": {
        "probeUrl": "https://wry-manatee-359.convex.site/api/v1/download?slug=4claw-imageboard",
        "contentDisposition": "attachment; filename=\"4claw-imageboard-1.0.1.zip\"",
        "redirectLocation": null,
        "bodySnippet": null
      },
      "scope": "source",
      "summary": "Source download looks usable.",
      "detail": "Yavira can redirect you to the upstream package for this source.",
      "primaryActionLabel": "Download for OpenClaw",
      "primaryActionHref": "/downloads/data-anomaly-detector"
    },
    "validation": {
      "installChecklist": [
        "Use the Yavira download entry.",
        "Review SKILL.md after the package is downloaded.",
        "Confirm the extracted package contains the expected setup assets."
      ],
      "postInstallChecks": [
        "Confirm the extracted package includes the expected docs or setup files.",
        "Validate the skill or prompts are available in your target agent workspace.",
        "Capture any manual follow-up steps the agent could not complete."
      ]
    },
    "downloadPageUrl": "https://openagent3.xyz/downloads/data-anomaly-detector",
    "agentPageUrl": "https://openagent3.xyz/skills/data-anomaly-detector/agent",
    "manifestUrl": "https://openagent3.xyz/skills/data-anomaly-detector/agent.json",
    "briefUrl": "https://openagent3.xyz/skills/data-anomaly-detector/agent.md"
  },
  "agentAssist": {
    "summary": "Hand the extracted package to your coding agent with a concrete install brief instead of figuring it out manually.",
    "steps": [
      "Download the package from Yavira.",
      "Extract it into a folder your agent can access.",
      "Paste one of the prompts below and point your agent at the extracted folder."
    ],
    "prompts": [
      {
        "label": "New install",
        "body": "I downloaded a skill package from Yavira. Read SKILL.md from the extracted folder and install it by following the included instructions. Tell me what you changed and call out any manual steps you could not complete."
      },
      {
        "label": "Upgrade existing",
        "body": "I downloaded an updated skill package from Yavira. Read SKILL.md from the extracted folder, compare it with my current installation, and upgrade it while preserving any custom configuration unless the package docs explicitly say otherwise. Summarize what changed and any follow-up checks I should run."
      }
    ]
  },
  "documentation": {
    "source": "clawhub",
    "primaryDoc": "SKILL.md",
    "sections": [
      {
        "title": "Overview",
        "body": "Detect unusual patterns, outliers, and anomalies in construction data. Identify cost overruns, schedule delays, productivity issues, and data quality problems before they impact projects."
      },
      {
        "title": "Business Case",
        "body": "Construction data often contains anomalies that indicate:\n\nCost estimate errors or fraud\nSchedule logic issues\nProductivity problems\nData entry mistakes\nEquipment or material issues\n\nEarly detection prevents costly corrections and project delays."
      },
      {
        "title": "Technical Implementation",
        "body": "from dataclasses import dataclass, field\nfrom typing import List, Dict, Any, Optional, Tuple\nfrom enum import Enum\nimport pandas as pd\nimport numpy as np\nfrom datetime import datetime\nfrom scipy import stats\n\nclass AnomalyType(Enum):\n    OUTLIER = \"outlier\"\n    PATTERN_BREAK = \"pattern_break\"\n    MISSING_SEQUENCE = \"missing_sequence\"\n    DUPLICATE = \"duplicate\"\n    IMPOSSIBLE_VALUE = \"impossible_value\"\n    TREND_DEVIATION = \"trend_deviation\"\n\nclass AnomalySeverity(Enum):\n    CRITICAL = \"critical\"\n    HIGH = \"high\"\n    MEDIUM = \"medium\"\n    LOW = \"low\"\n\n@dataclass\nclass Anomaly:\n    id: str\n    anomaly_type: AnomalyType\n    severity: AnomalySeverity\n    field: str\n    value: Any\n    expected_range: Optional[Tuple[float, float]] = None\n    description: str = \"\"\n    row_index: Optional[int] = None\n    detection_method: str = \"\"\n    confidence: float = 0.0\n    suggested_action: str = \"\"\n\n@dataclass\nclass AnomalyReport:\n    source: str\n    detected_at: datetime\n    total_records: int\n    anomalies: List[Anomaly]\n    summary: Dict[str, int]\n\nclass ConstructionAnomalyDetector:\n    \"\"\"Detect anomalies in construction data.\"\"\"\n\n    # Construction-specific thresholds\n    COST_THRESHOLDS = {\n        'concrete_per_cy': (200, 800),\n        'steel_per_ton': (1500, 4000),\n        'labor_per_hour': (25, 150),\n        'overhead_percentage': (5, 25),\n        'contingency_percentage': (3, 20),\n    }\n\n    SCHEDULE_THRESHOLDS = {\n        'max_activity_duration': 365,  # days\n        'max_lag': 30,  # days\n        'min_productivity': 0.1,\n        'max_productivity': 10.0,\n    }\n\n    def __init__(self):\n        self.anomalies: List[Anomaly] = []\n        self.detection_history: List[AnomalyReport] = []\n\n    def detect_cost_anomalies(self, df: pd.DataFrame, cost_column: str,\n                              group_by: str = None) -> List[Anomaly]:\n        \"\"\"Detect anomalies in cost data.\"\"\"\n        anomalies = []\n\n        # Statistical outlier detection (IQR method)\n        Q1 = df[cost_column].quantile(0.25)\n        Q3 = df[cost_column].quantile(0.75)\n        IQR = Q3 - Q1\n        lower_bound = Q1 - 1.5 * IQR\n        upper_bound = Q3 + 1.5 * IQR\n\n        outliers = df[(df[cost_column] < lower_bound) | (df[cost_column] > upper_bound)]\n\n        for idx, row in outliers.iterrows():\n            value = row[cost_column]\n            severity = AnomalySeverity.HIGH if abs(value - df[cost_column].median()) > 3 * IQR else AnomalySeverity.MEDIUM\n\n            anomalies.append(Anomaly(\n                id=f\"COST-{idx}\",\n                anomaly_type=AnomalyType.OUTLIER,\n                severity=severity,\n                field=cost_column,\n                value=value,\n                expected_range=(lower_bound, upper_bound),\n                description=f\"Cost value {value:,.2f} outside expected range\",\n                row_index=idx,\n                detection_method=\"IQR\",\n                confidence=0.95,\n                suggested_action=\"Review cost estimate for errors\"\n            ))\n\n        # Negative cost check\n        negatives = df[df[cost_column] < 0]\n        for idx, row in negatives.iterrows():\n            anomalies.append(Anomaly(\n                id=f\"COST-NEG-{idx}\",\n                anomaly_type=AnomalyType.IMPOSSIBLE_VALUE,\n                severity=AnomalySeverity.CRITICAL,\n                field=cost_column,\n                value=row[cost_column],\n                expected_range=(0, None),\n                description=\"Negative cost value detected\",\n                row_index=idx,\n                detection_method=\"Business Rule\",\n                confidence=1.0,\n                suggested_action=\"Correct data entry error or investigate credit\"\n            ))\n\n        # Group-based anomalies (if grouped)\n        if group_by and group_by in df.columns:\n            group_stats = df.groupby(group_by)[cost_column].agg(['mean', 'std'])\n\n            for group_name, stats in group_stats.iterrows():\n                group_data = df[df[group_by] == group_name]\n                z_scores = np.abs((group_data[cost_column] - stats['mean']) / stats['std'])\n\n                for idx, z in z_scores.items():\n                    if z > 3:\n                        anomalies.append(Anomaly(\n                            id=f\"COST-GROUP-{idx}\",\n                            anomaly_type=AnomalyType.OUTLIER,\n                            severity=AnomalySeverity.MEDIUM,\n                            field=cost_column,\n                            value=df.loc[idx, cost_column],\n                            description=f\"Unusual cost for group {group_name} (z-score: {z:.2f})\",\n                            row_index=idx,\n                            detection_method=\"Z-Score by Group\",\n                            confidence=min(z / 5, 1.0)\n                        ))\n\n        return anomalies\n\n    def detect_schedule_anomalies(self, df: pd.DataFrame) -> List[Anomaly]:\n        \"\"\"Detect anomalies in schedule data.\"\"\"\n        anomalies = []\n\n        # Check for required columns\n        required = ['start_date', 'end_date']\n        if not all(col in df.columns for col in required):\n            return anomalies\n\n        # Convert dates\n        df['start_date'] = pd.to_datetime(df['start_date'])\n        df['end_date'] = pd.to_datetime(df['end_date'])\n\n        # Calculate duration\n        df['duration'] = (df['end_date'] - df['start_date']).dt.days\n\n        # Negative duration (end before start)\n        negative_duration = df[df['duration'] < 0]\n        for idx, row in negative_duration.iterrows():\n            anomalies.append(Anomaly(\n                id=f\"SCHED-NEG-{idx}\",\n                anomaly_type=AnomalyType.IMPOSSIBLE_VALUE,\n                severity=AnomalySeverity.CRITICAL,\n                field=\"duration\",\n                value=row['duration'],\n                description=\"End date before start date\",\n                row_index=idx,\n                detection_method=\"Business Rule\",\n                confidence=1.0,\n                suggested_action=\"Correct dates\"\n            ))\n\n        # Extremely long durations\n        long_tasks = df[df['duration'] > self.SCHEDULE_THRESHOLDS['max_activity_duration']]\n        for idx, row in long_tasks.iterrows():\n            anomalies.append(Anomaly(\n                id=f\"SCHED-LONG-{idx}\",\n                anomaly_type=AnomalyType.OUTLIER,\n                severity=AnomalySeverity.MEDIUM,\n                field=\"duration\",\n                value=row['duration'],\n                expected_range=(0, self.SCHEDULE_THRESHOLDS['max_activity_duration']),\n                description=f\"Task duration {row['duration']} days exceeds threshold\",\n                row_index=idx,\n                detection_method=\"Threshold\",\n                confidence=0.9,\n                suggested_action=\"Review if task should be broken down\"\n            ))\n\n        # Zero duration non-milestones\n        if 'is_milestone' in df.columns:\n            zero_duration = df[(df['duration'] == 0) & (~df['is_milestone'])]\n            for idx, row in zero_duration.iterrows():\n                anomalies.append(Anomaly(\n                    id=f\"SCHED-ZERO-{idx}\",\n                    anomaly_type=AnomalyType.IMPOSSIBLE_VALUE,\n                    severity=AnomalySeverity.HIGH,\n                    field=\"duration\",\n                    value=0,\n                    description=\"Zero duration task that is not a milestone\",\n                    row_index=idx,\n                    detection_method=\"Business Rule\",\n                    confidence=1.0,\n                    suggested_action=\"Add duration or mark as milestone\"\n                ))\n\n        return anomalies\n\n    def detect_productivity_anomalies(self, df: pd.DataFrame,\n                                      quantity_col: str,\n                                      hours_col: str) -> List[Anomaly]:\n        \"\"\"Detect productivity anomalies.\"\"\"\n        anomalies = []\n\n        # Calculate productivity\n        df['productivity'] = df[quantity_col] / df[hours_col].replace(0, np.nan)\n\n        # Use Modified Z-Score (more robust for skewed data)\n        median = df['productivity'].median()\n        mad = np.abs(df['productivity'] - median).median()\n        modified_z = 0.6745 * (df['productivity'] - median) / mad\n\n        outliers = df[np.abs(modified_z) > 3.5]\n\n        for idx, row in outliers.iterrows():\n            prod = row['productivity']\n            z = modified_z.loc[idx]\n\n            severity = AnomalySeverity.HIGH if abs(z) > 5 else AnomalySeverity.MEDIUM\n            direction = \"high\" if z > 0 else \"low\"\n\n            anomalies.append(Anomaly(\n                id=f\"PROD-{idx}\",\n                anomaly_type=AnomalyType.OUTLIER,\n                severity=severity,\n                field=\"productivity\",\n                value=prod,\n                description=f\"Unusually {direction} productivity: {prod:.2f} units/hour\",\n                row_index=idx,\n                detection_method=\"Modified Z-Score\",\n                confidence=min(abs(z) / 7, 1.0),\n                suggested_action=f\"Investigate {direction} productivity cause\"\n            ))\n\n        return anomalies\n\n    def detect_time_series_anomalies(self, df: pd.DataFrame,\n                                      date_col: str,\n                                      value_col: str,\n                                      window: int = 7) -> List[Anomaly]:\n        \"\"\"Detect anomalies in time series data (e.g., daily costs, progress).\"\"\"\n        anomalies = []\n\n        df = df.sort_values(date_col).copy()\n        df['rolling_mean'] = df[value_col].rolling(window=window, center=True).mean()\n        df['rolling_std'] = df[value_col].rolling(window=window, center=True).std()\n\n        # Points outside 2 standard deviations from rolling mean\n        df['z_score'] = (df[value_col] - df['rolling_mean']) / df['rolling_std']\n\n        outliers = df[np.abs(df['z_score']) > 2].dropna()\n\n        for idx, row in outliers.iterrows():\n            anomalies.append(Anomaly(\n                id=f\"TS-{idx}\",\n                anomaly_type=AnomalyType.TREND_DEVIATION,\n                severity=AnomalySeverity.MEDIUM if abs(row['z_score']) < 3 else AnomalySeverity.HIGH,\n                field=value_col,\n                value=row[value_col],\n                expected_range=(\n                    row['rolling_mean'] - 2 * row['rolling_std'],\n                    row['rolling_mean'] + 2 * row['rolling_std']\n                ),\n                description=f\"Value deviates from {window}-day trend\",\n                row_index=idx,\n                detection_method=\"Rolling Z-Score\",\n                confidence=min(abs(row['z_score']) / 4, 1.0)\n            ))\n\n        return anomalies\n\n    def detect_duplicate_anomalies(self, df: pd.DataFrame,\n                                   key_columns: List[str]) -> List[Anomaly]:\n        \"\"\"Detect duplicate records.\"\"\"\n        anomalies = []\n\n        duplicates = df[df.duplicated(subset=key_columns, keep=False)]\n\n        if len(duplicates) > 0:\n            dup_groups = duplicates.groupby(key_columns).size()\n            for keys, count in dup_groups.items():\n                anomalies.append(Anomaly(\n                    id=f\"DUP-{hash(str(keys)) % 10000}\",\n                    anomaly_type=AnomalyType.DUPLICATE,\n                    severity=AnomalySeverity.HIGH,\n                    field=str(key_columns),\n                    value=keys,\n                    description=f\"Found {count} duplicate records for {keys}\",\n                    detection_method=\"Exact Match\",\n                    confidence=1.0,\n                    suggested_action=\"Review and remove duplicates\"\n                ))\n\n        return anomalies\n\n    def detect_sequence_gaps(self, df: pd.DataFrame, sequence_col: str) -> List[Anomaly]:\n        \"\"\"Detect gaps in sequential data (invoice numbers, PO numbers, etc.).\"\"\"\n        anomalies = []\n\n        # Extract numeric part if mixed format\n        df['seq_num'] = pd.to_numeric(\n            df[sequence_col].astype(str).str.extract(r'(\\d+)')[0],\n            errors='coerce'\n        )\n\n        sorted_seq = df['seq_num'].dropna().sort_values()\n        expected = range(int(sorted_seq.min()), int(sorted_seq.max()) + 1)\n        actual = set(sorted_seq.astype(int))\n        missing = set(expected) - actual\n\n        if missing:\n            # Group consecutive missing numbers\n            missing_ranges = []\n            sorted_missing = sorted(missing)\n            start = sorted_missing[0]\n            end = start\n\n            for num in sorted_missing[1:]:\n                if num == end + 1:\n                    end = num\n                else:\n                    missing_ranges.append((start, end))\n                    start = num\n                    end = num\n            missing_ranges.append((start, end))\n\n            for start, end in missing_ranges:\n                range_str = str(start) if start == end else f\"{start}-{end}\"\n                anomalies.append(Anomaly(\n                    id=f\"SEQ-{start}\",\n                    anomaly_type=AnomalyType.MISSING_SEQUENCE,\n                    severity=AnomalySeverity.MEDIUM,\n                    field=sequence_col,\n                    value=range_str,\n                    description=f\"Missing sequence number(s): {range_str}\",\n                    detection_method=\"Sequence Analysis\",\n                    confidence=1.0,\n                    suggested_action=\"Investigate missing numbers\"\n                ))\n\n        return anomalies\n\n    def run_full_detection(self, df: pd.DataFrame, config: Dict) -> AnomalyReport:\n        \"\"\"Run all applicable anomaly detection methods.\"\"\"\n        all_anomalies = []\n\n        # Cost anomalies\n        if 'cost_columns' in config:\n            for col in config['cost_columns']:\n                if col in df.columns:\n                    all_anomalies.extend(\n                        self.detect_cost_anomalies(df, col, config.get('group_by'))\n                    )\n\n        # Schedule anomalies\n        if 'start_date' in df.columns and 'end_date' in df.columns:\n            all_anomalies.extend(self.detect_schedule_anomalies(df))\n\n        # Productivity\n        if 'quantity_col' in config and 'hours_col' in config:\n            all_anomalies.extend(\n                self.detect_productivity_anomalies(\n                    df, config['quantity_col'], config['hours_col']\n                )\n            )\n\n        # Duplicates\n        if 'key_columns' in config:\n            all_anomalies.extend(\n                self.detect_duplicate_anomalies(df, config['key_columns'])\n            )\n\n        # Sequence gaps\n        if 'sequence_column' in config:\n            all_anomalies.extend(\n                self.detect_sequence_gaps(df, config['sequence_column'])\n            )\n\n        # Create summary\n        summary = {}\n        for a in all_anomalies:\n            key = f\"{a.anomaly_type.value}_{a.severity.value}\"\n            summary[key] = summary.get(key, 0) + 1\n\n        report = AnomalyReport(\n            source=config.get('source_name', 'Unknown'),\n            detected_at=datetime.now(),\n            total_records=len(df),\n            anomalies=all_anomalies,\n            summary=summary\n        )\n\n        self.detection_history.append(report)\n        return report\n\n    def generate_report(self, report: AnomalyReport) -> str:\n        \"\"\"Generate markdown anomaly report.\"\"\"\n        lines = [f\"# Anomaly Detection Report\", \"\"]\n        lines.append(f\"**Source:** {report.source}\")\n        lines.append(f\"**Detected At:** {report.detected_at.strftime('%Y-%m-%d %H:%M')}\")\n        lines.append(f\"**Total Records:** {report.total_records:,}\")\n        lines.append(f\"**Anomalies Found:** {len(report.anomalies)}\")\n        lines.append(\"\")\n\n        # Summary by severity\n        lines.append(\"## Summary by Severity\")\n        for severity in AnomalySeverity:\n            count = sum(1 for a in report.anomalies if a.severity == severity)\n            if count > 0:\n                lines.append(f\"- **{severity.value.upper()}:** {count}\")\n        lines.append(\"\")\n\n        # Critical anomalies first\n        critical = [a for a in report.anomalies if a.severity == AnomalySeverity.CRITICAL]\n        if critical:\n            lines.append(\"## Critical Anomalies\")\n            for a in critical:\n                lines.append(f\"\\n### {a.id}\")\n                lines.append(f\"- **Type:** {a.anomaly_type.value}\")\n                lines.append(f\"- **Field:** {a.field}\")\n                lines.append(f\"- **Value:** {a.value}\")\n                lines.append(f\"- **Description:** {a.description}\")\n                lines.append(f\"- **Action:** {a.suggested_action}\")\n\n        # All anomalies table\n        lines.append(\"\\n## All Anomalies\")\n        lines.append(\"| ID | Type | Severity | Field | Description |\")\n        lines.append(\"|-----|------|----------|-------|-------------|\")\n        for a in report.anomalies[:50]:\n            lines.append(f\"| {a.id} | {a.anomaly_type.value} | {a.severity.value} | {a.field} | {a.description[:50]} |\")\n\n        if len(report.anomalies) > 50:\n            lines.append(f\"\\n*... and {len(report.anomalies) - 50} more anomalies*\")\n\n        return \"\\n\".join(lines)"
      },
      {
        "title": "Quick Start",
        "body": "import pandas as pd\n\n# Load data\ndf = pd.read_excel(\"project_costs.xlsx\")\n\n# Initialize detector\ndetector = ConstructionAnomalyDetector()\n\n# Run detection\nconfig = {\n    'source_name': 'Project Costs Q1 2026',\n    'cost_columns': ['total_cost', 'labor_cost', 'material_cost'],\n    'group_by': 'cost_code',\n    'key_columns': ['project_id', 'cost_code', 'date'],\n    'sequence_column': 'invoice_number'\n}\n\nreport = detector.run_full_detection(df, config)\n\n# Generate report\nprint(detector.generate_report(report))\n\n# Get critical anomalies for immediate action\ncritical = [a for a in report.anomalies if a.severity == AnomalySeverity.CRITICAL]\nprint(f\"\\n{len(critical)} critical anomalies require immediate attention\")"
      },
      {
        "title": "Dependencies",
        "body": "pip install pandas numpy scipy"
      },
      {
        "title": "Resources",
        "body": "Statistical Methods: IQR, Z-Score, Modified Z-Score\nConstruction Benchmarks: RSMeans, ENR indices"
      }
    ],
    "body": "Data Anomaly Detector for Construction\nOverview\n\nDetect unusual patterns, outliers, and anomalies in construction data. Identify cost overruns, schedule delays, productivity issues, and data quality problems before they impact projects.\n\nBusiness Case\n\nConstruction data often contains anomalies that indicate:\n\nCost estimate errors or fraud\nSchedule logic issues\nProductivity problems\nData entry mistakes\nEquipment or material issues\n\nEarly detection prevents costly corrections and project delays.\n\nTechnical Implementation\nfrom dataclasses import dataclass, field\nfrom typing import List, Dict, Any, Optional, Tuple\nfrom enum import Enum\nimport pandas as pd\nimport numpy as np\nfrom datetime import datetime\nfrom scipy import stats\n\nclass AnomalyType(Enum):\n    OUTLIER = \"outlier\"\n    PATTERN_BREAK = \"pattern_break\"\n    MISSING_SEQUENCE = \"missing_sequence\"\n    DUPLICATE = \"duplicate\"\n    IMPOSSIBLE_VALUE = \"impossible_value\"\n    TREND_DEVIATION = \"trend_deviation\"\n\nclass AnomalySeverity(Enum):\n    CRITICAL = \"critical\"\n    HIGH = \"high\"\n    MEDIUM = \"medium\"\n    LOW = \"low\"\n\n@dataclass\nclass Anomaly:\n    id: str\n    anomaly_type: AnomalyType\n    severity: AnomalySeverity\n    field: str\n    value: Any\n    expected_range: Optional[Tuple[float, float]] = None\n    description: str = \"\"\n    row_index: Optional[int] = None\n    detection_method: str = \"\"\n    confidence: float = 0.0\n    suggested_action: str = \"\"\n\n@dataclass\nclass AnomalyReport:\n    source: str\n    detected_at: datetime\n    total_records: int\n    anomalies: List[Anomaly]\n    summary: Dict[str, int]\n\nclass ConstructionAnomalyDetector:\n    \"\"\"Detect anomalies in construction data.\"\"\"\n\n    # Construction-specific thresholds\n    COST_THRESHOLDS = {\n        'concrete_per_cy': (200, 800),\n        'steel_per_ton': (1500, 4000),\n        'labor_per_hour': (25, 150),\n        'overhead_percentage': (5, 25),\n        'contingency_percentage': (3, 20),\n    }\n\n    SCHEDULE_THRESHOLDS = {\n        'max_activity_duration': 365,  # days\n        'max_lag': 30,  # days\n        'min_productivity': 0.1,\n        'max_productivity': 10.0,\n    }\n\n    def __init__(self):\n        self.anomalies: List[Anomaly] = []\n        self.detection_history: List[AnomalyReport] = []\n\n    def detect_cost_anomalies(self, df: pd.DataFrame, cost_column: str,\n                              group_by: str = None) -> List[Anomaly]:\n        \"\"\"Detect anomalies in cost data.\"\"\"\n        anomalies = []\n\n        # Statistical outlier detection (IQR method)\n        Q1 = df[cost_column].quantile(0.25)\n        Q3 = df[cost_column].quantile(0.75)\n        IQR = Q3 - Q1\n        lower_bound = Q1 - 1.5 * IQR\n        upper_bound = Q3 + 1.5 * IQR\n\n        outliers = df[(df[cost_column] < lower_bound) | (df[cost_column] > upper_bound)]\n\n        for idx, row in outliers.iterrows():\n            value = row[cost_column]\n            severity = AnomalySeverity.HIGH if abs(value - df[cost_column].median()) > 3 * IQR else AnomalySeverity.MEDIUM\n\n            anomalies.append(Anomaly(\n                id=f\"COST-{idx}\",\n                anomaly_type=AnomalyType.OUTLIER,\n                severity=severity,\n                field=cost_column,\n                value=value,\n                expected_range=(lower_bound, upper_bound),\n                description=f\"Cost value {value:,.2f} outside expected range\",\n                row_index=idx,\n                detection_method=\"IQR\",\n                confidence=0.95,\n                suggested_action=\"Review cost estimate for errors\"\n            ))\n\n        # Negative cost check\n        negatives = df[df[cost_column] < 0]\n        for idx, row in negatives.iterrows():\n            anomalies.append(Anomaly(\n                id=f\"COST-NEG-{idx}\",\n                anomaly_type=AnomalyType.IMPOSSIBLE_VALUE,\n                severity=AnomalySeverity.CRITICAL,\n                field=cost_column,\n                value=row[cost_column],\n                expected_range=(0, None),\n                description=\"Negative cost value detected\",\n                row_index=idx,\n                detection_method=\"Business Rule\",\n                confidence=1.0,\n                suggested_action=\"Correct data entry error or investigate credit\"\n            ))\n\n        # Group-based anomalies (if grouped)\n        if group_by and group_by in df.columns:\n            group_stats = df.groupby(group_by)[cost_column].agg(['mean', 'std'])\n\n            for group_name, stats in group_stats.iterrows():\n                group_data = df[df[group_by] == group_name]\n                z_scores = np.abs((group_data[cost_column] - stats['mean']) / stats['std'])\n\n                for idx, z in z_scores.items():\n                    if z > 3:\n                        anomalies.append(Anomaly(\n                            id=f\"COST-GROUP-{idx}\",\n                            anomaly_type=AnomalyType.OUTLIER,\n                            severity=AnomalySeverity.MEDIUM,\n                            field=cost_column,\n                            value=df.loc[idx, cost_column],\n                            description=f\"Unusual cost for group {group_name} (z-score: {z:.2f})\",\n                            row_index=idx,\n                            detection_method=\"Z-Score by Group\",\n                            confidence=min(z / 5, 1.0)\n                        ))\n\n        return anomalies\n\n    def detect_schedule_anomalies(self, df: pd.DataFrame) -> List[Anomaly]:\n        \"\"\"Detect anomalies in schedule data.\"\"\"\n        anomalies = []\n\n        # Check for required columns\n        required = ['start_date', 'end_date']\n        if not all(col in df.columns for col in required):\n            return anomalies\n\n        # Convert dates\n        df['start_date'] = pd.to_datetime(df['start_date'])\n        df['end_date'] = pd.to_datetime(df['end_date'])\n\n        # Calculate duration\n        df['duration'] = (df['end_date'] - df['start_date']).dt.days\n\n        # Negative duration (end before start)\n        negative_duration = df[df['duration'] < 0]\n        for idx, row in negative_duration.iterrows():\n            anomalies.append(Anomaly(\n                id=f\"SCHED-NEG-{idx}\",\n                anomaly_type=AnomalyType.IMPOSSIBLE_VALUE,\n                severity=AnomalySeverity.CRITICAL,\n                field=\"duration\",\n                value=row['duration'],\n                description=\"End date before start date\",\n                row_index=idx,\n                detection_method=\"Business Rule\",\n                confidence=1.0,\n                suggested_action=\"Correct dates\"\n            ))\n\n        # Extremely long durations\n        long_tasks = df[df['duration'] > self.SCHEDULE_THRESHOLDS['max_activity_duration']]\n        for idx, row in long_tasks.iterrows():\n            anomalies.append(Anomaly(\n                id=f\"SCHED-LONG-{idx}\",\n                anomaly_type=AnomalyType.OUTLIER,\n                severity=AnomalySeverity.MEDIUM,\n                field=\"duration\",\n                value=row['duration'],\n                expected_range=(0, self.SCHEDULE_THRESHOLDS['max_activity_duration']),\n                description=f\"Task duration {row['duration']} days exceeds threshold\",\n                row_index=idx,\n                detection_method=\"Threshold\",\n                confidence=0.9,\n                suggested_action=\"Review if task should be broken down\"\n            ))\n\n        # Zero duration non-milestones\n        if 'is_milestone' in df.columns:\n            zero_duration = df[(df['duration'] == 0) & (~df['is_milestone'])]\n            for idx, row in zero_duration.iterrows():\n                anomalies.append(Anomaly(\n                    id=f\"SCHED-ZERO-{idx}\",\n                    anomaly_type=AnomalyType.IMPOSSIBLE_VALUE,\n                    severity=AnomalySeverity.HIGH,\n                    field=\"duration\",\n                    value=0,\n                    description=\"Zero duration task that is not a milestone\",\n                    row_index=idx,\n                    detection_method=\"Business Rule\",\n                    confidence=1.0,\n                    suggested_action=\"Add duration or mark as milestone\"\n                ))\n\n        return anomalies\n\n    def detect_productivity_anomalies(self, df: pd.DataFrame,\n                                      quantity_col: str,\n                                      hours_col: str) -> List[Anomaly]:\n        \"\"\"Detect productivity anomalies.\"\"\"\n        anomalies = []\n\n        # Calculate productivity\n        df['productivity'] = df[quantity_col] / df[hours_col].replace(0, np.nan)\n\n        # Use Modified Z-Score (more robust for skewed data)\n        median = df['productivity'].median()\n        mad = np.abs(df['productivity'] - median).median()\n        modified_z = 0.6745 * (df['productivity'] - median) / mad\n\n        outliers = df[np.abs(modified_z) > 3.5]\n\n        for idx, row in outliers.iterrows():\n            prod = row['productivity']\n            z = modified_z.loc[idx]\n\n            severity = AnomalySeverity.HIGH if abs(z) > 5 else AnomalySeverity.MEDIUM\n            direction = \"high\" if z > 0 else \"low\"\n\n            anomalies.append(Anomaly(\n                id=f\"PROD-{idx}\",\n                anomaly_type=AnomalyType.OUTLIER,\n                severity=severity,\n                field=\"productivity\",\n                value=prod,\n                description=f\"Unusually {direction} productivity: {prod:.2f} units/hour\",\n                row_index=idx,\n                detection_method=\"Modified Z-Score\",\n                confidence=min(abs(z) / 7, 1.0),\n                suggested_action=f\"Investigate {direction} productivity cause\"\n            ))\n\n        return anomalies\n\n    def detect_time_series_anomalies(self, df: pd.DataFrame,\n                                      date_col: str,\n                                      value_col: str,\n                                      window: int = 7) -> List[Anomaly]:\n        \"\"\"Detect anomalies in time series data (e.g., daily costs, progress).\"\"\"\n        anomalies = []\n\n        df = df.sort_values(date_col).copy()\n        df['rolling_mean'] = df[value_col].rolling(window=window, center=True).mean()\n        df['rolling_std'] = df[value_col].rolling(window=window, center=True).std()\n\n        # Points outside 2 standard deviations from rolling mean\n        df['z_score'] = (df[value_col] - df['rolling_mean']) / df['rolling_std']\n\n        outliers = df[np.abs(df['z_score']) > 2].dropna()\n\n        for idx, row in outliers.iterrows():\n            anomalies.append(Anomaly(\n                id=f\"TS-{idx}\",\n                anomaly_type=AnomalyType.TREND_DEVIATION,\n                severity=AnomalySeverity.MEDIUM if abs(row['z_score']) < 3 else AnomalySeverity.HIGH,\n                field=value_col,\n                value=row[value_col],\n                expected_range=(\n                    row['rolling_mean'] - 2 * row['rolling_std'],\n                    row['rolling_mean'] + 2 * row['rolling_std']\n                ),\n                description=f\"Value deviates from {window}-day trend\",\n                row_index=idx,\n                detection_method=\"Rolling Z-Score\",\n                confidence=min(abs(row['z_score']) / 4, 1.0)\n            ))\n\n        return anomalies\n\n    def detect_duplicate_anomalies(self, df: pd.DataFrame,\n                                   key_columns: List[str]) -> List[Anomaly]:\n        \"\"\"Detect duplicate records.\"\"\"\n        anomalies = []\n\n        duplicates = df[df.duplicated(subset=key_columns, keep=False)]\n\n        if len(duplicates) > 0:\n            dup_groups = duplicates.groupby(key_columns).size()\n            for keys, count in dup_groups.items():\n                anomalies.append(Anomaly(\n                    id=f\"DUP-{hash(str(keys)) % 10000}\",\n                    anomaly_type=AnomalyType.DUPLICATE,\n                    severity=AnomalySeverity.HIGH,\n                    field=str(key_columns),\n                    value=keys,\n                    description=f\"Found {count} duplicate records for {keys}\",\n                    detection_method=\"Exact Match\",\n                    confidence=1.0,\n                    suggested_action=\"Review and remove duplicates\"\n                ))\n\n        return anomalies\n\n    def detect_sequence_gaps(self, df: pd.DataFrame, sequence_col: str) -> List[Anomaly]:\n        \"\"\"Detect gaps in sequential data (invoice numbers, PO numbers, etc.).\"\"\"\n        anomalies = []\n\n        # Extract numeric part if mixed format\n        df['seq_num'] = pd.to_numeric(\n            df[sequence_col].astype(str).str.extract(r'(\\d+)')[0],\n            errors='coerce'\n        )\n\n        sorted_seq = df['seq_num'].dropna().sort_values()\n        expected = range(int(sorted_seq.min()), int(sorted_seq.max()) + 1)\n        actual = set(sorted_seq.astype(int))\n        missing = set(expected) - actual\n\n        if missing:\n            # Group consecutive missing numbers\n            missing_ranges = []\n            sorted_missing = sorted(missing)\n            start = sorted_missing[0]\n            end = start\n\n            for num in sorted_missing[1:]:\n                if num == end + 1:\n                    end = num\n                else:\n                    missing_ranges.append((start, end))\n                    start = num\n                    end = num\n            missing_ranges.append((start, end))\n\n            for start, end in missing_ranges:\n                range_str = str(start) if start == end else f\"{start}-{end}\"\n                anomalies.append(Anomaly(\n                    id=f\"SEQ-{start}\",\n                    anomaly_type=AnomalyType.MISSING_SEQUENCE,\n                    severity=AnomalySeverity.MEDIUM,\n                    field=sequence_col,\n                    value=range_str,\n                    description=f\"Missing sequence number(s): {range_str}\",\n                    detection_method=\"Sequence Analysis\",\n                    confidence=1.0,\n                    suggested_action=\"Investigate missing numbers\"\n                ))\n\n        return anomalies\n\n    def run_full_detection(self, df: pd.DataFrame, config: Dict) -> AnomalyReport:\n        \"\"\"Run all applicable anomaly detection methods.\"\"\"\n        all_anomalies = []\n\n        # Cost anomalies\n        if 'cost_columns' in config:\n            for col in config['cost_columns']:\n                if col in df.columns:\n                    all_anomalies.extend(\n                        self.detect_cost_anomalies(df, col, config.get('group_by'))\n                    )\n\n        # Schedule anomalies\n        if 'start_date' in df.columns and 'end_date' in df.columns:\n            all_anomalies.extend(self.detect_schedule_anomalies(df))\n\n        # Productivity\n        if 'quantity_col' in config and 'hours_col' in config:\n            all_anomalies.extend(\n                self.detect_productivity_anomalies(\n                    df, config['quantity_col'], config['hours_col']\n                )\n            )\n\n        # Duplicates\n        if 'key_columns' in config:\n            all_anomalies.extend(\n                self.detect_duplicate_anomalies(df, config['key_columns'])\n            )\n\n        # Sequence gaps\n        if 'sequence_column' in config:\n            all_anomalies.extend(\n                self.detect_sequence_gaps(df, config['sequence_column'])\n            )\n\n        # Create summary\n        summary = {}\n        for a in all_anomalies:\n            key = f\"{a.anomaly_type.value}_{a.severity.value}\"\n            summary[key] = summary.get(key, 0) + 1\n\n        report = AnomalyReport(\n            source=config.get('source_name', 'Unknown'),\n            detected_at=datetime.now(),\n            total_records=len(df),\n            anomalies=all_anomalies,\n            summary=summary\n        )\n\n        self.detection_history.append(report)\n        return report\n\n    def generate_report(self, report: AnomalyReport) -> str:\n        \"\"\"Generate markdown anomaly report.\"\"\"\n        lines = [f\"# Anomaly Detection Report\", \"\"]\n        lines.append(f\"**Source:** {report.source}\")\n        lines.append(f\"**Detected At:** {report.detected_at.strftime('%Y-%m-%d %H:%M')}\")\n        lines.append(f\"**Total Records:** {report.total_records:,}\")\n        lines.append(f\"**Anomalies Found:** {len(report.anomalies)}\")\n        lines.append(\"\")\n\n        # Summary by severity\n        lines.append(\"## Summary by Severity\")\n        for severity in AnomalySeverity:\n            count = sum(1 for a in report.anomalies if a.severity == severity)\n            if count > 0:\n                lines.append(f\"- **{severity.value.upper()}:** {count}\")\n        lines.append(\"\")\n\n        # Critical anomalies first\n        critical = [a for a in report.anomalies if a.severity == AnomalySeverity.CRITICAL]\n        if critical:\n            lines.append(\"## Critical Anomalies\")\n            for a in critical:\n                lines.append(f\"\\n### {a.id}\")\n                lines.append(f\"- **Type:** {a.anomaly_type.value}\")\n                lines.append(f\"- **Field:** {a.field}\")\n                lines.append(f\"- **Value:** {a.value}\")\n                lines.append(f\"- **Description:** {a.description}\")\n                lines.append(f\"- **Action:** {a.suggested_action}\")\n\n        # All anomalies table\n        lines.append(\"\\n## All Anomalies\")\n        lines.append(\"| ID | Type | Severity | Field | Description |\")\n        lines.append(\"|-----|------|----------|-------|-------------|\")\n        for a in report.anomalies[:50]:\n            lines.append(f\"| {a.id} | {a.anomaly_type.value} | {a.severity.value} | {a.field} | {a.description[:50]} |\")\n\n        if len(report.anomalies) > 50:\n            lines.append(f\"\\n*... and {len(report.anomalies) - 50} more anomalies*\")\n\n        return \"\\n\".join(lines)\n\nQuick Start\nimport pandas as pd\n\n# Load data\ndf = pd.read_excel(\"project_costs.xlsx\")\n\n# Initialize detector\ndetector = ConstructionAnomalyDetector()\n\n# Run detection\nconfig = {\n    'source_name': 'Project Costs Q1 2026',\n    'cost_columns': ['total_cost', 'labor_cost', 'material_cost'],\n    'group_by': 'cost_code',\n    'key_columns': ['project_id', 'cost_code', 'date'],\n    'sequence_column': 'invoice_number'\n}\n\nreport = detector.run_full_detection(df, config)\n\n# Generate report\nprint(detector.generate_report(report))\n\n# Get critical anomalies for immediate action\ncritical = [a for a in report.anomalies if a.severity == AnomalySeverity.CRITICAL]\nprint(f\"\\n{len(critical)} critical anomalies require immediate attention\")\n\nDependencies\npip install pandas numpy scipy\n\nResources\nStatistical Methods: IQR, Z-Score, Modified Z-Score\nConstruction Benchmarks: RSMeans, ENR indices"
  },
  "trust": {
    "sourceLabel": "tencent",
    "provenanceUrl": "https://clawhub.ai/datadrivenconstruction/data-anomaly-detector",
    "publisherUrl": "https://clawhub.ai/datadrivenconstruction/data-anomaly-detector",
    "owner": "datadrivenconstruction",
    "version": "2.1.0",
    "license": null,
    "verificationStatus": "Indexed source record"
  },
  "links": {
    "detailUrl": "https://openagent3.xyz/skills/data-anomaly-detector",
    "downloadUrl": "https://openagent3.xyz/downloads/data-anomaly-detector",
    "agentUrl": "https://openagent3.xyz/skills/data-anomaly-detector/agent",
    "manifestUrl": "https://openagent3.xyz/skills/data-anomaly-detector/agent.json",
    "briefUrl": "https://openagent3.xyz/skills/data-anomaly-detector/agent.md"
  }
}