{
  "schemaVersion": "1.0",
  "item": {
    "slug": "data-source-audit",
    "name": "Data Source Audit",
    "source": "tencent",
    "type": "skill",
    "category": "数据分析",
    "sourceUrl": "https://clawhub.ai/datadrivenconstruction/data-source-audit",
    "canonicalUrl": "https://clawhub.ai/datadrivenconstruction/data-source-audit",
    "targetPlatform": "OpenClaw"
  },
  "install": {
    "downloadMode": "redirect",
    "downloadUrl": "/downloads/data-source-audit",
    "sourceDownloadUrl": "https://wry-manatee-359.convex.site/api/v1/download?slug=data-source-audit",
    "sourcePlatform": "tencent",
    "targetPlatform": "OpenClaw",
    "installMethod": "Manual import",
    "extraction": "Extract archive",
    "prerequisites": [
      "OpenClaw"
    ],
    "packageFormat": "ZIP package",
    "includedAssets": [
      "claw.json",
      "instructions.md",
      "SKILL.md"
    ],
    "primaryDoc": "SKILL.md",
    "quickSetup": [
      "Download the package from Yavira.",
      "Extract the archive and review SKILL.md first.",
      "Import or place the package into your OpenClaw setup."
    ],
    "agentAssist": {
      "summary": "Hand the extracted package to your coding agent with a concrete install brief instead of figuring it out manually.",
      "steps": [
        "Download the package from Yavira.",
        "Extract it into a folder your agent can access.",
        "Paste one of the prompts below and point your agent at the extracted folder."
      ],
      "prompts": [
        {
          "label": "New install",
          "body": "I downloaded a skill package from Yavira. Read SKILL.md from the extracted folder and install it by following the included instructions. Tell me what you changed and call out any manual steps you could not complete."
        },
        {
          "label": "Upgrade existing",
          "body": "I downloaded an updated skill package from Yavira. Read SKILL.md from the extracted folder, compare it with my current installation, and upgrade it while preserving any custom configuration unless the package docs explicitly say otherwise. Summarize what changed and any follow-up checks I should run."
        }
      ]
    },
    "sourceHealth": {
      "source": "tencent",
      "status": "healthy",
      "reason": "direct_download_ok",
      "recommendedAction": "download",
      "checkedAt": "2026-04-23T16:43:11.935Z",
      "expiresAt": "2026-04-30T16:43:11.935Z",
      "httpStatus": 200,
      "finalUrl": "https://wry-manatee-359.convex.site/api/v1/download?slug=4claw-imageboard",
      "contentType": "application/zip",
      "probeMethod": "head",
      "details": {
        "probeUrl": "https://wry-manatee-359.convex.site/api/v1/download?slug=4claw-imageboard",
        "contentDisposition": "attachment; filename=\"4claw-imageboard-1.0.1.zip\"",
        "redirectLocation": null,
        "bodySnippet": null
      },
      "scope": "source",
      "summary": "Source download looks usable.",
      "detail": "Yavira can redirect you to the upstream package for this source.",
      "primaryActionLabel": "Download for OpenClaw",
      "primaryActionHref": "/downloads/data-source-audit"
    },
    "validation": {
      "installChecklist": [
        "Use the Yavira download entry.",
        "Review SKILL.md after the package is downloaded.",
        "Confirm the extracted package contains the expected setup assets."
      ],
      "postInstallChecks": [
        "Confirm the extracted package includes the expected docs or setup files.",
        "Validate the skill or prompts are available in your target agent workspace.",
        "Capture any manual follow-up steps the agent could not complete."
      ]
    },
    "downloadPageUrl": "https://openagent3.xyz/downloads/data-source-audit",
    "agentPageUrl": "https://openagent3.xyz/skills/data-source-audit/agent",
    "manifestUrl": "https://openagent3.xyz/skills/data-source-audit/agent.json",
    "briefUrl": "https://openagent3.xyz/skills/data-source-audit/agent.md"
  },
  "agentAssist": {
    "summary": "Hand the extracted package to your coding agent with a concrete install brief instead of figuring it out manually.",
    "steps": [
      "Download the package from Yavira.",
      "Extract it into a folder your agent can access.",
      "Paste one of the prompts below and point your agent at the extracted folder."
    ],
    "prompts": [
      {
        "label": "New install",
        "body": "I downloaded a skill package from Yavira. Read SKILL.md from the extracted folder and install it by following the included instructions. Tell me what you changed and call out any manual steps you could not complete."
      },
      {
        "label": "Upgrade existing",
        "body": "I downloaded an updated skill package from Yavira. Read SKILL.md from the extracted folder, compare it with my current installation, and upgrade it while preserving any custom configuration unless the package docs explicitly say otherwise. Summarize what changed and any follow-up checks I should run."
      }
    ]
  },
  "documentation": {
    "source": "clawhub",
    "primaryDoc": "SKILL.md",
    "sections": [
      {
        "title": "Overview",
        "body": "Perform comprehensive audits of construction data sources to identify silos, map data flows, assess quality, and plan integration strategies. Essential for digital transformation and data-driven construction initiatives."
      },
      {
        "title": "Business Case",
        "body": "Construction organizations typically have 10-50+ data sources:\n\nProject management systems\nEstimating software\nScheduling tools\nAccounting/ERP systems\nBIM platforms\nDocument management systems\nField apps\nSpreadsheets\n\nNote: This skill is vendor-agnostic and works with any data source. Product names mentioned elsewhere in examples are trademarks of their respective owners.\n\nThis skill helps:\n\nDiscover all data sources\nMap data flows and dependencies\nIdentify integration opportunities\nPrioritize data improvement efforts"
      },
      {
        "title": "Technical Implementation",
        "body": "from dataclasses import dataclass, field\nfrom typing import List, Dict, Any, Optional, Set\nfrom enum import Enum\nfrom datetime import datetime\nimport pandas as pd\nimport json\n\nclass DataSourceType(Enum):\n    DATABASE = \"database\"\n    API = \"api\"\n    FILE_SHARE = \"file_share\"\n    CLOUD_APP = \"cloud_app\"\n    SPREADSHEET = \"spreadsheet\"\n    LEGACY_SYSTEM = \"legacy_system\"\n    IOT_SENSOR = \"iot_sensor\"\n    MANUAL_ENTRY = \"manual_entry\"\n\nclass DataDomain(Enum):\n    COST = \"cost\"\n    SCHEDULE = \"schedule\"\n    BIM = \"bim\"\n    DOCUMENT = \"document\"\n    FIELD = \"field\"\n    SAFETY = \"safety\"\n    QUALITY = \"quality\"\n    HR = \"hr\"\n    ACCOUNTING = \"accounting\"\n    PROCUREMENT = \"procurement\"\n\n@dataclass\nclass DataSource:\n    name: str\n    source_type: DataSourceType\n    domains: List[DataDomain]\n    owner: str\n    department: str\n    description: str\n    # Technical details\n    technology: str\n    location: str  # cloud, on-prem, hybrid\n    access_method: str  # API, ODBC, file export, manual\n    # Data characteristics\n    update_frequency: str  # real-time, daily, weekly, monthly, ad-hoc\n    data_volume: str  # small, medium, large\n    retention_period: str\n    # Quality metrics\n    completeness_score: float = 0.0\n    accuracy_score: float = 0.0\n    timeliness_score: float = 0.0\n    # Integration status\n    integrations: List[str] = field(default_factory=list)\n    is_master: bool = False  # Is this the master source for any entity?\n    master_for: List[str] = field(default_factory=list)\n    # Issues\n    known_issues: List[str] = field(default_factory=list)\n    # Metadata\n    last_audit_date: Optional[datetime] = None\n    audit_notes: str = \"\"\n\n@dataclass\nclass DataFlow:\n    source: str\n    target: str\n    flow_type: str  # push, pull, bidirectional, manual\n    frequency: str\n    entities: List[str]  # What data entities flow\n    transformation: str  # none, simple, complex\n    status: str  # active, planned, deprecated\n\n@dataclass\nclass DataSilo:\n    name: str\n    sources: List[str]\n    impact: str  # high, medium, low\n    description: str\n    resolution_options: List[str]\n\nclass DataSourceAuditor:\n    \"\"\"Audit and analyze construction data sources.\"\"\"\n\n    def __init__(self):\n        self.sources: Dict[str, DataSource] = {}\n        self.flows: List[DataFlow] = []\n        self.silos: List[DataSilo] = []\n\n    def add_source(self, source: DataSource):\n        \"\"\"Register a data source.\"\"\"\n        self.sources[source.name] = source\n\n    def add_flow(self, flow: DataFlow):\n        \"\"\"Register a data flow between sources.\"\"\"\n        self.flows.append(flow)\n\n    def discover_sources_from_survey(self, survey_responses: List[Dict]) -> List[DataSource]:\n        \"\"\"Create data sources from survey responses.\"\"\"\n        sources = []\n\n        for response in survey_responses:\n            source = DataSource(\n                name=response['system_name'],\n                source_type=DataSourceType(response['type']),\n                domains=[DataDomain(d) for d in response['domains']],\n                owner=response['owner'],\n                department=response['department'],\n                description=response['description'],\n                technology=response['technology'],\n                location=response['location'],\n                access_method=response['access_method'],\n                
update_frequency=response['update_frequency'],\n                data_volume=response['data_volume'],\n                retention_period=response['retention_period'],\n            )\n            sources.append(source)\n            self.add_source(source)\n\n        return sources\n\n    def identify_silos(self) -> List[DataSilo]:\n        \"\"\"Identify data silos based on integration analysis.\"\"\"\n        silos = []\n\n        # Find sources with no integrations\n        isolated_sources = [\n            name for name, source in self.sources.items()\n            if not source.integrations and source.source_type != DataSourceType.MANUAL_ENTRY\n        ]\n\n        if isolated_sources:\n            silos.append(DataSilo(\n                name=\"Isolated Systems\",\n                sources=isolated_sources,\n                impact=\"high\",\n                description=\"Systems with no integrations, requiring manual data transfer\",\n                resolution_options=[\n                    \"Implement API integration\",\n                    \"Set up automated file exports\",\n                    \"Migrate to integrated platform\"\n                ]\n            ))\n\n        # Find duplicate data domains without master\n        domain_sources: Dict[DataDomain, List[str]] = {}\n        for name, source in self.sources.items():\n            for domain in source.domains:\n                if domain not in domain_sources:\n                    domain_sources[domain] = []\n                domain_sources[domain].append(name)\n\n        for domain, sources in domain_sources.items():\n            if len(sources) > 1:\n                # Check if any is designated master\n                masters = [s for s in sources if self.sources[s].is_master]\n                if not masters:\n                    silos.append(DataSilo(\n                        name=f\"No Master for {domain.value}\",\n                        sources=sources,\n                        impact=\"medium\",\n                        description=f\"Multiple sources for {domain.value} data without designated master\",\n                        resolution_options=[\n                            \"Designate master data source\",\n                            \"Implement MDM solution\",\n                            \"Create data reconciliation process\"\n                        ]\n                    ))\n\n        # Find one-way flows that should be bidirectional\n        flow_pairs = {}\n        for flow in self.flows:\n            key = tuple(sorted([flow.source, flow.target]))\n            if key not in flow_pairs:\n                flow_pairs[key] = []\n            flow_pairs[key].append(flow)\n\n        for (s1, s2), flows in flow_pairs.items():\n            if len(flows) == 1 and flows[0].flow_type != 'bidirectional':\n                # Check if bidirectional would make sense\n                s1_domains = set(self.sources[s1].domains)\n                s2_domains = set(self.sources[s2].domains)\n                if s1_domains & s2_domains:  # Overlapping domains\n                    silos.append(DataSilo(\n                        name=f\"One-way flow: {s1} -> {s2}\",\n                        sources=[s1, s2],\n                        impact=\"low\",\n                        description=\"Data flows one direction only between systems with overlapping domains\",\n                        resolution_options=[\n                            \"Evaluate need for bidirectional sync\",\n                            \"Implement change data capture\"\n      
                  ]\n                    ))\n\n        self.silos = silos\n        return silos\n\n    def assess_source_quality(self, source_name: str, sample_data: pd.DataFrame) -> Dict[str, float]:\n        \"\"\"Assess data quality for a source based on sample data.\"\"\"\n        if source_name not in self.sources:\n            raise ValueError(f\"Unknown source: {source_name}\")\n\n        scores = {}\n\n        # Completeness: % of non-null values\n        completeness = 1 - (sample_data.isnull().sum().sum() / sample_data.size)\n        scores['completeness'] = completeness\n\n        # Uniqueness: % of unique rows (for key columns)\n        if len(sample_data) > 0:\n            uniqueness = len(sample_data.drop_duplicates()) / len(sample_data)\n        else:\n            uniqueness = 1.0\n        scores['uniqueness'] = uniqueness\n\n        # Validity: Basic format checks (simplified)\n        validity_checks = 0\n        total_checks = 0\n\n        for col in sample_data.columns:\n            if 'date' in col.lower():\n                total_checks += 1\n                try:\n                    pd.to_datetime(sample_data[col], errors='raise')\n                    validity_checks += 1\n                except:\n                    pass\n            if 'email' in col.lower():\n                total_checks += 1\n                valid_emails = sample_data[col].str.contains(r'@.*\\.', na=False).sum()\n                if valid_emails / len(sample_data) > 0.9:\n                    validity_checks += 1\n\n        scores['validity'] = validity_checks / total_checks if total_checks > 0 else 1.0\n\n        # Update source with scores\n        self.sources[source_name].completeness_score = scores['completeness']\n        self.sources[source_name].accuracy_score = scores['validity']\n\n        return scores\n\n    def create_data_catalog(self) -> pd.DataFrame:\n        \"\"\"Create a data catalog from all sources.\"\"\"\n        catalog_entries = []\n\n        for name, source in self.sources.items():\n            entry = {\n                'Source Name': name,\n                'Type': source.source_type.value,\n                'Domains': ', '.join(d.value for d in source.domains),\n                'Owner': source.owner,\n                'Department': source.department,\n                'Technology': source.technology,\n                'Location': source.location,\n                'Access Method': source.access_method,\n                'Update Frequency': source.update_frequency,\n                'Data Volume': source.data_volume,\n                'Integrations': len(source.integrations),\n                'Is Master': 'Yes' if source.is_master else 'No',\n                'Quality Score': (source.completeness_score + source.accuracy_score) / 2,\n                'Known Issues': len(source.known_issues),\n            }\n            catalog_entries.append(entry)\n\n        return pd.DataFrame(catalog_entries)\n\n    def generate_integration_matrix(self) -> pd.DataFrame:\n        \"\"\"Generate integration matrix showing connections between sources.\"\"\"\n        source_names = list(self.sources.keys())\n        matrix = pd.DataFrame(\n            index=source_names,\n            columns=source_names,\n            data=''\n        )\n\n        for flow in self.flows:\n            if flow.source in source_names and flow.target in source_names:\n                current = matrix.loc[flow.source, flow.target]\n                symbol = '→' if flow.flow_type == 'push' else '←' if flow.flow_type == 'pull' 
else '↔'\n                matrix.loc[flow.source, flow.target] = f\"{current}{symbol}\" if current else symbol\n\n        return matrix\n\n    def calculate_integration_score(self) -> Dict[str, float]:\n        \"\"\"Calculate overall integration score and breakdown.\"\"\"\n        if not self.sources:\n            return {'overall': 0.0}\n\n        scores = {}\n\n        # Coverage: % of sources with at least one integration\n        integrated = sum(1 for s in self.sources.values() if s.integrations)\n        scores['coverage'] = integrated / len(self.sources)\n\n        # Master data: % of domains with designated master\n        domains_with_master = set()\n        for source in self.sources.values():\n            if source.is_master:\n                domains_with_master.update(source.master_for)\n\n        all_domains = set()\n        for source in self.sources.values():\n            all_domains.update(d.value for d in source.domains)\n\n        scores['master_data'] = len(domains_with_master) / len(all_domains) if all_domains else 1.0\n\n        # Data quality average\n        quality_scores = [\n            (s.completeness_score + s.accuracy_score) / 2\n            for s in self.sources.values()\n            if s.completeness_score > 0 or s.accuracy_score > 0\n        ]\n        scores['quality'] = sum(quality_scores) / len(quality_scores) if quality_scores else 0.0\n\n        # Silo impact\n        high_impact_silos = sum(1 for s in self.silos if s.impact == 'high')\n        scores['silo_risk'] = 1 - (high_impact_silos * 0.2)  # Each high-impact silo reduces score\n\n        # Overall\n        scores['overall'] = (\n            scores['coverage'] * 0.3 +\n            scores['master_data'] * 0.25 +\n            scores['quality'] * 0.25 +\n            scores['silo_risk'] * 0.2\n        )\n\n        return scores\n\n    def generate_audit_report(self) -> str:\n        \"\"\"Generate comprehensive audit report.\"\"\"\n        report = [\"# Data Source Audit Report\", \"\"]\n        report.append(f\"**Audit Date:** {datetime.now().strftime('%Y-%m-%d')}\")\n        report.append(f\"**Total Sources:** {len(self.sources)}\")\n        report.append(f\"**Total Data Flows:** {len(self.flows)}\")\n        report.append(\"\")\n\n        # Integration Score\n        scores = self.calculate_integration_score()\n        report.append(\"## Integration Maturity Score\")\n        report.append(f\"**Overall Score:** {scores['overall']:.1%}\")\n        report.append(f\"- Coverage: {scores['coverage']:.1%}\")\n        report.append(f\"- Master Data: {scores['master_data']:.1%}\")\n        report.append(f\"- Data Quality: {scores['quality']:.1%}\")\n        report.append(f\"- Silo Risk: {scores['silo_risk']:.1%}\")\n        report.append(\"\")\n\n        # Sources by Type\n        report.append(\"## Sources by Type\")\n        by_type = {}\n        for source in self.sources.values():\n            t = source.source_type.value\n            by_type[t] = by_type.get(t, 0) + 1\n        for t, count in sorted(by_type.items(), key=lambda x: -x[1]):\n            report.append(f\"- {t}: {count}\")\n        report.append(\"\")\n\n        # Data Silos\n        report.append(\"## Identified Data Silos\")\n        if self.silos:\n            for silo in self.silos:\n                report.append(f\"\\n### {silo.name}\")\n                report.append(f\"**Impact:** {silo.impact}\")\n                report.append(f\"**Sources:** {', '.join(silo.sources)}\")\n                report.append(f\"**Description:** 
{silo.description}\")\n                report.append(\"**Resolution Options:**\")\n                for opt in silo.resolution_options:\n                    report.append(f\"- {opt}\")\n        else:\n            report.append(\"No significant data silos identified.\")\n        report.append(\"\")\n\n        # Recommendations\n        report.append(\"## Recommendations\")\n        recommendations = self._generate_recommendations()\n        for i, rec in enumerate(recommendations, 1):\n            report.append(f\"{i}. {rec}\")\n\n        return \"\\n\".join(report)\n\n    def _generate_recommendations(self) -> List[str]:\n        \"\"\"Generate recommendations based on audit findings.\"\"\"\n        recommendations = []\n\n        scores = self.calculate_integration_score()\n\n        if scores['coverage'] < 0.7:\n            recommendations.append(\n                \"Increase integration coverage - over 30% of systems are isolated. \"\n                \"Prioritize connecting high-value data sources.\"\n            )\n\n        if scores['master_data'] < 0.5:\n            recommendations.append(\n                \"Implement Master Data Management - designate authoritative sources \"\n                \"for key entities (projects, vendors, employees, cost codes).\"\n            )\n\n        if scores['quality'] < 0.7:\n            recommendations.append(\n                \"Improve data quality - implement validation rules at data entry points \"\n                \"and automated quality monitoring.\"\n            )\n\n        # Check for spreadsheet dependency\n        spreadsheets = [s for s in self.sources.values()\n                       if s.source_type == DataSourceType.SPREADSHEET]\n        if len(spreadsheets) > 3:\n            recommendations.append(\n                f\"Reduce spreadsheet dependency - {len(spreadsheets)} spreadsheet-based \"\n                \"data sources identified. Migrate critical data to proper databases.\"\n            )\n\n        # Check for legacy systems\n        legacy = [s for s in self.sources.values()\n                 if s.source_type == DataSourceType.LEGACY_SYSTEM]\n        if legacy:\n            recommendations.append(\n                f\"Plan legacy system migration - {len(legacy)} legacy systems identified. \"\n                \"Create modernization roadmap.\"\n            )\n\n        return recommendations"
      },
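      {
        "title": "Integration Matrix Example",
        "body": "A minimal, self-contained sketch of generate_integration_matrix and calculate_integration_score; the two sources and the flow below are illustrative, not part of the shipped skill.\n\n# Two hypothetical sources sharing the cost domain\nauditor = DataSourceAuditor()\nfor name, domains in [('EstimatingDB', [DataDomain.COST]),\n                      ('ERP', [DataDomain.COST, DataDomain.ACCOUNTING])]:\n    auditor.add_source(DataSource(\n        name=name, source_type=DataSourceType.DATABASE, domains=domains,\n        owner='IT', department='IT', description='Illustrative source',\n        technology='SQL Server', location='on-prem', access_method='odbc',\n        update_frequency='daily', data_volume='medium', retention_period='7 years'\n    ))\n\n# One push flow; the matrix renders it as '→' in the EstimatingDB row\nauditor.add_flow(DataFlow(source='EstimatingDB', target='ERP', flow_type='push',\n                          frequency='daily', entities=['estimates'],\n                          transformation='none', status='active'))\n\nprint(auditor.generate_integration_matrix())\nprint(auditor.calculate_integration_score())  # coverage, master_data, quality, silo_risk, overall"
      },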
      {
        "title": "Quick Start",
        "body": "# Initialize auditor\nauditor = DataSourceAuditor()\n\n# Add known sources\nauditor.add_source(DataSource(\n    name=\"Procore\",\n    source_type=DataSourceType.CLOUD_APP,\n    domains=[DataDomain.DOCUMENT, DataDomain.FIELD, DataDomain.SCHEDULE],\n    owner=\"Project Controls\",\n    department=\"Operations\",\n    description=\"Primary project management platform\",\n    technology=\"SaaS\",\n    location=\"cloud\",\n    access_method=\"API\",\n    update_frequency=\"real-time\",\n    data_volume=\"large\",\n    retention_period=\"7 years\",\n    integrations=[\"Sage 300\", \"Primavera P6\"],\n    is_master=True,\n    master_for=[\"projects\", \"documents\"]\n))\n\nauditor.add_source(DataSource(\n    name=\"Sage 300\",\n    source_type=DataSourceType.DATABASE,\n    domains=[DataDomain.COST, DataDomain.ACCOUNTING],\n    owner=\"Finance\",\n    department=\"Accounting\",\n    description=\"ERP and job costing system\",\n    technology=\"SQL Server\",\n    location=\"on-prem\",\n    access_method=\"ODBC\",\n    update_frequency=\"daily\",\n    data_volume=\"medium\",\n    retention_period=\"10 years\",\n    is_master=True,\n    master_for=[\"costs\", \"vendors\", \"invoices\"]\n))\n\n# Add data flows\nauditor.add_flow(DataFlow(\n    source=\"Procore\",\n    target=\"Sage 300\",\n    flow_type=\"push\",\n    frequency=\"daily\",\n    entities=[\"change_orders\", \"budget_changes\"],\n    transformation=\"simple\",\n    status=\"active\"\n))\n\n# Identify silos\nsilos = auditor.identify_silos()\n\n# Generate report\nreport = auditor.generate_audit_report()\nprint(report)\n\n# Create data catalog\ncatalog = auditor.create_data_catalog()\ncatalog.to_excel(\"data_catalog.xlsx\", index=False)"
      },
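      {
        "title": "Quality Assessment Example",
        "body": "A minimal sketch of assess_source_quality, continuing from the Quick Start setup. The sample DataFrame is hypothetical; in practice, pull a representative extract from the source being audited.\n\nimport pandas as pd\n\n# Hypothetical extract from the Sage 300 vendor table\nsample = pd.DataFrame({\n    'vendor_name': ['Acme Concrete', 'Best Steel', None],\n    'created_date': ['2024-01-15', '2024-02-20', '2024-03-01'],\n    'contact_email': ['ap@acme.example', 'billing@beststeel.example', None],\n})\n\n# Writes completeness/accuracy scores back onto the registered source\nscores = auditor.assess_source_quality('Sage 300', sample)\nprint(scores)  # {'completeness': ..., 'uniqueness': ..., 'validity': ...}"
      },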
      {
        "title": "Survey Template",
        "body": "Use this survey to discover data sources across the organization:\n\nSystem Survey:\n  - system_name: \"What is the name of this system?\"\n  - type: \"What type of system is it?\"\n    options: [database, api, file_share, cloud_app, spreadsheet, legacy_system]\n  - domains: \"What types of data does it contain?\"\n    options: [cost, schedule, bim, document, field, safety, quality, hr, accounting]\n  - owner: \"Who is the system owner?\"\n  - department: \"Which department uses this system?\"\n  - technology: \"What technology/platform is it built on?\"\n  - location: \"Where is the system hosted?\"\n    options: [cloud, on-prem, hybrid]\n  - access_method: \"How can data be accessed?\"\n    options: [api, odbc, file_export, manual]\n  - update_frequency: \"How often is data updated?\"\n    options: [real-time, daily, weekly, monthly, ad-hoc]\n  - integrations: \"What other systems does it connect to?\""
      },
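      {
        "title": "Survey Intake Example",
        "body": "A minimal sketch of feeding collected responses into discover_sources_from_survey. The response below is hypothetical; type and domains entries must be valid DataSourceType and DataDomain values, and every surveyed field must be present.\n\nresponses = [{\n    'system_name': 'Field Reports App',\n    'type': 'cloud_app',\n    'domains': ['field', 'safety'],\n    'owner': 'Field Operations',\n    'department': 'Operations',\n    'description': 'Daily reports and safety observations',\n    'technology': 'SaaS',\n    'location': 'cloud',\n    'access_method': 'api',\n    'update_frequency': 'daily',\n    'data_volume': 'medium',\n    'retention_period': '5 years',\n}]\n\nnew_sources = auditor.discover_sources_from_survey(responses)\nprint([s.name for s in new_sources])  # ['Field Reports App']"
      },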
      {
        "title": "Resources",
        "body": "DAMA DMBOK: Data Management Body of Knowledge\nData Governance Frameworks: DCAM, EDM Council\nIntegration Patterns: Enterprise Integration Patterns book"
      }
    ]
  },
  "trust": {
    "sourceLabel": "tencent",
    "provenanceUrl": "https://clawhub.ai/datadrivenconstruction/data-source-audit",
    "publisherUrl": "https://clawhub.ai/datadrivenconstruction/data-source-audit",
    "owner": "datadrivenconstruction",
    "version": "2.1.0",
    "license": null,
    "verificationStatus": "Indexed source record"
  },
  "links": {
    "detailUrl": "https://openagent3.xyz/skills/data-source-audit",
    "downloadUrl": "https://openagent3.xyz/downloads/data-source-audit",
    "agentUrl": "https://openagent3.xyz/skills/data-source-audit/agent",
    "manifestUrl": "https://openagent3.xyz/skills/data-source-audit/agent.json",
    "briefUrl": "https://openagent3.xyz/skills/data-source-audit/agent.md"
  }
}