{
  "schemaVersion": "1.0",
  "item": {
    "slug": "data-type-classifier",
    "name": "Data Type Classifier",
    "source": "tencent",
    "type": "skill",
    "category": "数据分析",
    "sourceUrl": "https://clawhub.ai/datadrivenconstruction/data-type-classifier",
    "canonicalUrl": "https://clawhub.ai/datadrivenconstruction/data-type-classifier",
    "targetPlatform": "OpenClaw"
  },
  "install": {
    "downloadMode": "redirect",
    "downloadUrl": "/downloads/data-type-classifier",
    "sourceDownloadUrl": "https://wry-manatee-359.convex.site/api/v1/download?slug=data-type-classifier",
    "sourcePlatform": "tencent",
    "targetPlatform": "OpenClaw",
    "installMethod": "Manual import",
    "extraction": "Extract archive",
    "prerequisites": [
      "OpenClaw"
    ],
    "packageFormat": "ZIP package",
    "includedAssets": [
      "claw.json",
      "instructions.md",
      "SKILL.md"
    ],
    "primaryDoc": "SKILL.md",
    "quickSetup": [
      "Download the package from Yavira.",
      "Extract the archive and review SKILL.md first.",
      "Import or place the package into your OpenClaw setup."
    ],
    "agentAssist": {
      "summary": "Hand the extracted package to your coding agent with a concrete install brief instead of figuring it out manually.",
      "steps": [
        "Download the package from Yavira.",
        "Extract it into a folder your agent can access.",
        "Paste one of the prompts below and point your agent at the extracted folder."
      ],
      "prompts": [
        {
          "label": "New install",
          "body": "I downloaded a skill package from Yavira. Read SKILL.md from the extracted folder and install it by following the included instructions. Tell me what you changed and call out any manual steps you could not complete."
        },
        {
          "label": "Upgrade existing",
          "body": "I downloaded an updated skill package from Yavira. Read SKILL.md from the extracted folder, compare it with my current installation, and upgrade it while preserving any custom configuration unless the package docs explicitly say otherwise. Summarize what changed and any follow-up checks I should run."
        }
      ]
    },
    "sourceHealth": {
      "source": "tencent",
      "slug": "data-type-classifier",
      "status": "healthy",
      "reason": "direct_download_ok",
      "recommendedAction": "download",
      "checkedAt": "2026-04-29T08:49:14.066Z",
      "expiresAt": "2026-05-06T08:49:14.066Z",
      "httpStatus": 200,
      "finalUrl": "https://wry-manatee-359.convex.site/api/v1/download?slug=data-type-classifier",
      "contentType": "application/zip",
      "probeMethod": "head",
      "details": {
        "probeUrl": "https://wry-manatee-359.convex.site/api/v1/download?slug=data-type-classifier",
        "contentDisposition": "attachment; filename=\"data-type-classifier-2.1.0.zip\"",
        "redirectLocation": null,
        "bodySnippet": null,
        "slug": "data-type-classifier"
      },
      "scope": "item",
      "summary": "Item download looks usable.",
      "detail": "Yavira can redirect you to the upstream package for this item.",
      "primaryActionLabel": "Download for OpenClaw",
      "primaryActionHref": "/downloads/data-type-classifier"
    },
    "validation": {
      "installChecklist": [
        "Use the Yavira download entry.",
        "Review SKILL.md after the package is downloaded.",
        "Confirm the extracted package contains the expected setup assets."
      ],
      "postInstallChecks": [
        "Confirm the extracted package includes the expected docs or setup files.",
        "Validate the skill or prompts are available in your target agent workspace.",
        "Capture any manual follow-up steps the agent could not complete."
      ]
    },
    "downloadPageUrl": "https://openagent3.xyz/downloads/data-type-classifier",
    "agentPageUrl": "https://openagent3.xyz/skills/data-type-classifier/agent",
    "manifestUrl": "https://openagent3.xyz/skills/data-type-classifier/agent.json",
    "briefUrl": "https://openagent3.xyz/skills/data-type-classifier/agent.md"
  },
  "agentAssist": {
    "summary": "Hand the extracted package to your coding agent with a concrete install brief instead of figuring it out manually.",
    "steps": [
      "Download the package from Yavira.",
      "Extract it into a folder your agent can access.",
      "Paste one of the prompts below and point your agent at the extracted folder."
    ],
    "prompts": [
      {
        "label": "New install",
        "body": "I downloaded a skill package from Yavira. Read SKILL.md from the extracted folder and install it by following the included instructions. Tell me what you changed and call out any manual steps you could not complete."
      },
      {
        "label": "Upgrade existing",
        "body": "I downloaded an updated skill package from Yavira. Read SKILL.md from the extracted folder, compare it with my current installation, and upgrade it while preserving any custom configuration unless the package docs explicitly say otherwise. Summarize what changed and any follow-up checks I should run."
      }
    ]
  },
  "documentation": {
    "source": "clawhub",
    "primaryDoc": "SKILL.md",
    "sections": [
      {
        "title": "Overview",
        "body": "Based on DDC methodology (Chapter 2.1), this skill classifies construction data by type, analyzes data sources, and recommends appropriate storage, processing, and integration methods.\n\nBook Reference: \"Типы данных в строительстве\" / \"Data Types in Construction\""
      },
      {
        "title": "Quick Start",
        "body": "from dataclasses import dataclass, field\nfrom enum import Enum\nfrom typing import List, Dict, Optional, Any, Tuple\nfrom datetime import datetime\nimport json\nimport re\nimport mimetypes\n\nclass DataStructure(Enum):\n    \"\"\"Data structure classification\"\"\"\n    STRUCTURED = \"structured\"           # Tables, databases, spreadsheets\n    SEMI_STRUCTURED = \"semi_structured\" # JSON, XML, IFC\n    UNSTRUCTURED = \"unstructured\"       # Documents, images, videos\n    GEOMETRIC = \"geometric\"             # CAD, BIM geometry\n    TEMPORAL = \"temporal\"               # Time-series, schedules\n    SPATIAL = \"spatial\"                 # GIS, coordinates\n\nclass DataFormat(Enum):\n    \"\"\"Common construction data formats\"\"\"\n    # Structured\n    CSV = \"csv\"\n    EXCEL = \"excel\"\n    SQL = \"sql\"\n    PARQUET = \"parquet\"\n\n    # Semi-structured\n    JSON = \"json\"\n    XML = \"xml\"\n    IFC = \"ifc\"\n    BCF = \"bcf\"\n\n    # Unstructured\n    PDF = \"pdf\"\n    DOCX = \"docx\"\n    IMAGE = \"image\"\n    VIDEO = \"video\"\n\n    # Geometric\n    DWG = \"dwg\"\n    DXF = \"dxf\"\n    RVT = \"rvt\"\n    NWD = \"nwd\"\n    OBJ = \"obj\"\n    STL = \"stl\"\n\n    # Schedule\n    MPP = \"mpp\"\n    P6 = \"p6\"\n    XER = \"xer\"\n\nclass StorageRecommendation(Enum):\n    \"\"\"Storage system recommendations\"\"\"\n    RELATIONAL_DB = \"relational_database\"\n    DOCUMENT_DB = \"document_database\"\n    OBJECT_STORAGE = \"object_storage\"\n    GRAPH_DB = \"graph_database\"\n    TIME_SERIES_DB = \"time_series_database\"\n    VECTOR_DB = \"vector_database\"\n    FILE_SYSTEM = \"file_system\"\n    DATA_LAKE = \"data_lake\"\n\n@dataclass\nclass DataCharacteristics:\n    \"\"\"Characteristics of a data source\"\"\"\n    has_schema: bool\n    has_relationships: bool\n    is_queryable: bool\n    is_binary: bool\n    has_geometry: bool\n    has_temporal: bool\n    has_text_content: bool\n    avg_record_size: Optional[int] = None  # bytes\n    estimated_volume: Optional[str] = None  # small/medium/large/huge\n    update_frequency: Optional[str] = None\n\n@dataclass\nclass DataClassification:\n    \"\"\"Classification result for a data source\"\"\"\n    source_name: str\n    source_type: str\n    detected_format: DataFormat\n    structure: DataStructure\n    characteristics: DataCharacteristics\n    storage_recommendation: StorageRecommendation\n    processing_tools: List[str]\n    integration_options: List[str]\n    quality_considerations: List[str]\n    confidence: float\n\n@dataclass\nclass ClassificationReport:\n    \"\"\"Complete classification report\"\"\"\n    total_sources: int\n    classifications: List[DataClassification]\n    summary_by_structure: Dict[str, int]\n    summary_by_format: Dict[str, int]\n    storage_recommendations: Dict[str, List[str]]\n    integration_strategy: Dict[str, str]\n\n\nclass DataTypeClassifier:\n    \"\"\"\n    Classify construction data by type and recommend processing methods.\n    Based on DDC methodology Chapter 2.1.\n    \"\"\"\n\n    def __init__(self):\n        self.format_signatures = self._define_format_signatures()\n        self.structure_mapping = self._define_structure_mapping()\n        self.storage_mapping = self._define_storage_mapping()\n        self.processing_tools = self._define_processing_tools()\n\n    def _define_format_signatures(self) -> Dict[str, Dict]:\n        \"\"\"Define format detection signatures\"\"\"\n        return {\n            # File extensions\n            \".csv\": {\"format\": DataFormat.CSV, \"structure\": DataStructure.STRUCTURED},\n            \".xlsx\": {\"format\": DataFormat.EXCEL, \"structure\": DataStructure.STRUCTURED},\n            \".xls\": {\"format\": DataFormat.EXCEL, \"structure\": DataStructure.STRUCTURED},\n            \".json\": {\"format\": DataFormat.JSON, \"structure\": DataStructure.SEMI_STRUCTURED},\n            \".xml\": {\"format\": DataFormat.XML, \"structure\": DataStructure.SEMI_STRUCTURED},\n            \".ifc\": {\"format\": DataFormat.IFC, \"structure\": DataStructure.SEMI_STRUCTURED},\n            \".bcf\": {\"format\": DataFormat.BCF, \"structure\": DataStructure.SEMI_STRUCTURED},\n            \".pdf\": {\"format\": DataFormat.PDF, \"structure\": DataStructure.UNSTRUCTURED},\n            \".docx\": {\"format\": DataFormat.DOCX, \"structure\": DataStructure.UNSTRUCTURED},\n            \".dwg\": {\"format\": DataFormat.DWG, \"structure\": DataStructure.GEOMETRIC},\n            \".dxf\": {\"format\": DataFormat.DXF, \"structure\": DataStructure.GEOMETRIC},\n            \".rvt\": {\"format\": DataFormat.RVT, \"structure\": DataStructure.GEOMETRIC},\n            \".nwd\": {\"format\": DataFormat.NWD, \"structure\": DataStructure.GEOMETRIC},\n            \".mpp\": {\"format\": DataFormat.MPP, \"structure\": DataStructure.TEMPORAL},\n            \".xer\": {\"format\": DataFormat.XER, \"structure\": DataStructure.TEMPORAL},\n            \".parquet\": {\"format\": DataFormat.PARQUET, \"structure\": DataStructure.STRUCTURED},\n            \".jpg\": {\"format\": DataFormat.IMAGE, \"structure\": DataStructure.UNSTRUCTURED},\n            \".png\": {\"format\": DataFormat.IMAGE, \"structure\": DataStructure.UNSTRUCTURED},\n            \".mp4\": {\"format\": DataFormat.VIDEO, \"structure\": DataStructure.UNSTRUCTURED}\n        }\n\n    def _define_structure_mapping(self) -> Dict[DataStructure, Dict]:\n        \"\"\"Define characteristics for each structure type\"\"\"\n        return {\n            DataStructure.STRUCTURED: {\n                \"description\": \"Tabular data with fixed schema\",\n                \"examples\": [\"Cost databases\", \"Material lists\", \"Vendor records\"],\n                \"query_support\": True,\n                \"schema_required\": True\n            },\n            DataStructure.SEMI_STRUCTURED: {\n                \"description\": \"Hierarchical data with flexible schema\",\n                \"examples\": [\"BIM models (IFC)\", \"API responses\", \"Configuration files\"],\n                \"query_support\": True,\n                \"schema_required\": False\n            },\n            DataStructure.UNSTRUCTURED: {\n                \"description\": \"No predefined schema or format\",\n                \"examples\": [\"Contracts\", \"Photos\", \"Emails\", \"Meeting notes\"],\n                \"query_support\": False,\n                \"schema_required\": False\n            },\n            DataStructure.GEOMETRIC: {\n                \"description\": \"3D/2D geometric and spatial data\",\n                \"examples\": [\"CAD drawings\", \"BIM geometry\", \"Point clouds\"],\n                \"query_support\": True,\n                \"schema_required\": True\n            },\n            DataStructure.TEMPORAL: {\n                \"description\": \"Time-based sequential data\",\n                \"examples\": [\"Schedules\", \"Progress data\", \"Sensor readings\"],\n                \"query_support\": True,\n                \"schema_required\": True\n            },\n            DataStructure.SPATIAL: {\n                \"description\": \"Geographic and location data\",\n                \"examples\": [\"Site maps\", \"GPS tracks\", \"GIS layers\"],\n                \"query_support\": True,\n                \"schema_required\": True\n            }\n        }\n\n    def _define_storage_mapping(self) -> Dict[DataStructure, StorageRecommendation]:\n        \"\"\"Map data structures to storage recommendations\"\"\"\n        return {\n            DataStructure.STRUCTURED: StorageRecommendation.RELATIONAL_DB,\n            DataStructure.SEMI_STRUCTURED: StorageRecommendation.DOCUMENT_DB,\n            DataStructure.UNSTRUCTURED: StorageRecommendation.OBJECT_STORAGE,\n            DataStructure.GEOMETRIC: StorageRecommendation.FILE_SYSTEM,\n            DataStructure.TEMPORAL: StorageRecommendation.TIME_SERIES_DB,\n            DataStructure.SPATIAL: StorageRecommendation.RELATIONAL_DB\n        }\n\n    def _define_processing_tools(self) -> Dict[DataFormat, List[str]]:\n        \"\"\"Define processing tools for each format\"\"\"\n        return {\n            DataFormat.CSV: [\"pandas\", \"polars\", \"duckdb\"],\n            DataFormat.EXCEL: [\"pandas\", \"openpyxl\", \"xlrd\"],\n            DataFormat.JSON: [\"json\", \"pandas\", \"jq\"],\n            DataFormat.XML: [\"lxml\", \"ElementTree\", \"BeautifulSoup\"],\n            DataFormat.IFC: [\"ifcopenshell\", \"IfcOpenShell\", \"xBIM\"],\n            DataFormat.BCF: [\"bcfpython\", \"ifcopenshell\"],\n            DataFormat.PDF: [\"pdfplumber\", \"PyPDF2\", \"pdf2image\"],\n            DataFormat.DOCX: [\"python-docx\", \"mammoth\"],\n            DataFormat.DWG: [\"ezdxf\", \"Teigha\", \"ODA SDK\"],\n            DataFormat.DXF: [\"ezdxf\", \"dxfgrabber\"],\n            DataFormat.RVT: [\"Revit API\", \"pyRevit\", \"Dynamo\"],\n            DataFormat.NWD: [\"Navisworks API\", \"NW API\"],\n            DataFormat.MPP: [\"mpxj\", \"Project API\"],\n            DataFormat.XER: [\"xerparser\", \"P6 API\"],\n            DataFormat.PARQUET: [\"pandas\", \"pyarrow\", \"polars\"],\n            DataFormat.IMAGE: [\"PIL\", \"opencv\", \"scikit-image\"],\n            DataFormat.VIDEO: [\"opencv\", \"ffmpeg\", \"moviepy\"]\n        }\n\n    def classify_source(\n        self,\n        source_name: str,\n        source_type: str,\n        file_extension: Optional[str] = None,\n        sample_data: Optional[Any] = None,\n        metadata: Optional[Dict] = None\n    ) -> DataClassification:\n        \"\"\"\n        Classify a single data source.\n\n        Args:\n            source_name: Name of the data source\n            source_type: Type (file, database, api, etc.)\n            file_extension: File extension if applicable\n            sample_data: Sample of the data for analysis\n            metadata: Additional metadata\n\n        Returns:\n            Classification result\n        \"\"\"\n        # Detect format\n        detected_format, structure = self._detect_format(\n            file_extension, source_type, sample_data\n        )\n\n        # Analyze characteristics\n        characteristics = self._analyze_characteristics(\n            detected_format, structure, sample_data, metadata\n        )\n\n        # Determine storage recommendation\n        storage = self._recommend_storage(structure, characteristics)\n\n        # Get processing tools\n        tools = self.processing_tools.get(detected_format, [])\n\n        # Determine integration options\n        integration = self._get_integration_options(detected_format, structure)\n\n        # Quality considerations\n        quality = self._get_quality_considerations(detected_format, structure)\n\n        # Calculate confidence\n        confidence = self._calculate_confidence(\n            file_extension, sample_data, metadata\n        )\n\n        return DataClassification(\n            source_name=source_name,\n            source_type=source_type,\n            detected_format=detected_format,\n            structure=structure,\n            characteristics=characteristics,\n            storage_recommendation=storage,\n            processing_tools=tools,\n            integration_options=integration,\n            quality_considerations=quality,\n            confidence=confidence\n        )\n\n    def _detect_format(\n        self,\n        extension: Optional[str],\n        source_type: str,\n        sample: Optional[Any]\n    ) -> Tuple[DataFormat, DataStructure]:\n        \"\"\"Detect data format and structure\"\"\"\n        # Check file extension\n        if extension:\n            ext = extension.lower() if extension.startswith('.') else f\".{extension.lower()}\"\n            if ext in self.format_signatures:\n                sig = self.format_signatures[ext]\n                return sig[\"format\"], sig[\"structure\"]\n\n        # Check source type\n        if source_type == \"database\":\n            return DataFormat.SQL, DataStructure.STRUCTURED\n        elif source_type == \"api\":\n            return DataFormat.JSON, DataStructure.SEMI_STRUCTURED\n\n        # Analyze sample data\n        if sample:\n            if isinstance(sample, dict):\n                return DataFormat.JSON, DataStructure.SEMI_STRUCTURED\n            elif isinstance(sample, list) and all(isinstance(x, dict) for x in sample):\n                return DataFormat.JSON, DataStructure.STRUCTURED\n            elif isinstance(sample, str):\n                if sample.strip().startswith('<'):\n                    return DataFormat.XML, DataStructure.SEMI_STRUCTURED\n                elif sample.strip().startswith('{'):\n                    return DataFormat.JSON, DataStructure.SEMI_STRUCTURED\n\n        # Default\n        return DataFormat.JSON, DataStructure.SEMI_STRUCTURED\n\n    def _analyze_characteristics(\n        self,\n        format: DataFormat,\n        structure: DataStructure,\n        sample: Optional[Any],\n        metadata: Optional[Dict]\n    ) -> DataCharacteristics:\n        \"\"\"Analyze data characteristics\"\"\"\n        return DataCharacteristics(\n            has_schema=structure in [DataStructure.STRUCTURED, DataStructure.TEMPORAL],\n            has_relationships=format in [DataFormat.IFC, DataFormat.SQL],\n            is_queryable=structure != DataStructure.UNSTRUCTURED,\n            is_binary=format in [\n                DataFormat.DWG, DataFormat.RVT, DataFormat.NWD,\n                DataFormat.IMAGE, DataFormat.VIDEO, DataFormat.PDF\n            ],\n            has_geometry=structure == DataStructure.GEOMETRIC or format == DataFormat.IFC,\n            has_temporal=structure == DataStructure.TEMPORAL,\n            has_text_content=format in [\n                DataFormat.PDF, DataFormat.DOCX, DataFormat.CSV\n            ],\n            estimated_volume=metadata.get(\"volume\") if metadata else None,\n            update_frequency=metadata.get(\"update_frequency\") if metadata else None\n        )\n\n    def _recommend_storage(\n        self,\n        structure: DataStructure,\n        characteristics: DataCharacteristics\n    ) -> StorageRecommendation:\n        \"\"\"Recommend storage solution\"\"\"\n        # Special cases\n        if characteristics.has_text_content and not characteristics.has_schema:\n            return StorageRecommendation.VECTOR_DB\n\n        if characteristics.is_binary and characteristics.estimated_volume == \"huge\":\n            return StorageRecommendation.OBJECT_STORAGE\n\n        if characteristics.has_relationships:\n            return StorageRecommendation.GRAPH_DB\n\n        # Default mapping\n        return self.storage_mapping.get(structure, StorageRecommendation.FILE_SYSTEM)\n\n    def _get_integration_options(\n        self,\n        format: DataFormat,\n        structure: DataStructure\n    ) -> List[str]:\n        \"\"\"Get integration options for the data\"\"\"\n        options = []\n\n        if structure == DataStructure.STRUCTURED:\n            options.extend([\"Direct SQL queries\", \"ETL pipelines\", \"API export\"])\n        elif structure == DataStructure.SEMI_STRUCTURED:\n            options.extend([\"JSON/XML parsing\", \"Schema validation\", \"API integration\"])\n        elif structure == DataStructure.UNSTRUCTURED:\n            options.extend([\"OCR extraction\", \"NLP processing\", \"ML classification\"])\n        elif structure == DataStructure.GEOMETRIC:\n            options.extend([\"IFC export\", \"Geometry extraction\", \"Clash detection\"])\n\n        # Format-specific options\n        if format == DataFormat.IFC:\n            options.append(\"IFC import/export via IfcOpenShell\")\n        elif format == DataFormat.EXCEL:\n            options.append(\"Pandas DataFrame conversion\")\n        elif format == DataFormat.PDF:\n            options.append(\"PDF text/table extraction\")\n\n        return options\n\n    def _get_quality_considerations(\n        self,\n        format: DataFormat,\n        structure: DataStructure\n    ) -> List[str]:\n        \"\"\"Get quality considerations\"\"\"\n        considerations = []\n\n        if structure == DataStructure.STRUCTURED:\n            considerations.extend([\n                \"Validate schema consistency\",\n                \"Check for null/missing values\",\n                \"Verify data types\"\n            ])\n        elif structure == DataStructure.UNSTRUCTURED:\n            considerations.extend([\n                \"OCR accuracy verification\",\n                \"Text encoding issues\",\n                \"Content extraction completeness\"\n            ])\n        elif structure == DataStructure.GEOMETRIC:\n            considerations.extend([\n                \"Model validity (closed solids)\",\n                \"Coordinate system consistency\",\n                \"Unit verification\"\n            ])\n\n        # Format-specific\n        if format == DataFormat.IFC:\n            considerations.append(\"IFC schema version compatibility\")\n        elif format == DataFormat.EXCEL:\n            considerations.append(\"Formula vs value extraction\")\n\n        return considerations\n\n    def _calculate_confidence(\n        self,\n        extension: Optional[str],\n        sample: Optional[Any],\n        metadata: Optional[Dict]\n    ) -> float:\n        \"\"\"Calculate classification confidence\"\"\"\n        confidence = 0.5  # Base confidence\n\n        if extension:\n            confidence += 0.3  # Extension provides good hint\n        if sample:\n            confidence += 0.15  # Sample data helps\n        if metadata:\n            confidence += 0.05  # Metadata adds context\n\n        return min(1.0, confidence)\n\n    def classify_multiple(\n        self,\n        sources: List[Dict]\n    ) -> ClassificationReport:\n        \"\"\"\n        Classify multiple data sources.\n\n        Args:\n            sources: List of source definitions\n\n        Returns:\n            Complete classification report\n        \"\"\"\n        classifications = []\n\n        for source in sources:\n            classification = self.classify_source(\n                source_name=source[\"name\"],\n                source_type=source.get(\"type\", \"file\"),\n                file_extension=source.get(\"extension\"),\n                sample_data=source.get(\"sample\"),\n                metadata=source.get(\"metadata\")\n            )\n            classifications.append(classification)\n\n        # Generate summaries\n        summary_structure = {}\n        summary_format = {}\n        storage_recs = {}\n\n        for c in classifications:\n            # Structure summary\n            struct = c.structure.value\n            summary_structure[struct] = summary_structure.get(struct, 0) + 1\n\n            # Format summary\n            fmt = c.detected_format.value\n            summary_format[fmt] = summary_format.get(fmt, 0) + 1\n\n            # Storage recommendations\n            storage = c.storage_recommendation.value\n            if storage not in storage_recs:\n                storage_recs[storage] = []\n            storage_recs[storage].append(c.source_name)\n\n        # Integration strategy\n        strategy = self._generate_integration_strategy(classifications)\n\n        return ClassificationReport(\n            total_sources=len(sources),\n            classifications=classifications,\n            summary_by_structure=summary_structure,\n            summary_by_format=summary_format,\n            storage_recommendations=storage_recs,\n            integration_strategy=strategy\n        )\n\n    def _generate_integration_strategy(\n        self,\n        classifications: List[DataClassification]\n    ) -> Dict[str, str]:\n        \"\"\"Generate integration strategy\"\"\"\n        strategy = {}\n\n        # Group by structure\n        structured = [c for c in classifications if c.structure == DataStructure.STRUCTURED]\n        semi = [c for c in classifications if c.structure == DataStructure.SEMI_STRUCTURED]\n        unstructured = [c for c in classifications if c.structure == DataStructure.UNSTRUCTURED]\n        geometric = [c for c in classifications if c.structure == DataStructure.GEOMETRIC]\n\n        if structured:\n            strategy[\"structured_data\"] = (\n                \"Use ETL pipeline to consolidate into central data warehouse. \"\n                \"Implement SQL-based querying and reporting.\"\n            )\n\n        if semi:\n            strategy[\"semi_structured_data\"] = (\n                \"Use document database for flexible storage. \"\n                \"Implement schema validation at ingestion.\"\n            )\n\n        if unstructured:\n            strategy[\"unstructured_data\"] = (\n                \"Extract text content using OCR/NLP. \"\n                \"Store in vector database for semantic search.\"\n            )\n\n        if geometric:\n            strategy[\"geometric_data\"] = (\n                \"Standardize on IFC format for exchange. \"\n                \"Maintain native formats for editing.\"\n            )\n\n        return strategy\n\n    def generate_report(self, report: ClassificationReport) -> str:\n        \"\"\"Generate classification report\"\"\"\n        output = f\"\"\"\n# Data Classification Report\n\n**Total Sources Analyzed:** {report.total_sources}\n\n## Summary by Structure\n\n\"\"\"\n        for struct, count in report.summary_by_structure.items():\n            output += f\"- **{struct.title()}**: {count} sources\\n\"\n\n        output += \"\\n## Summary by Format\\n\\n\"\n        for fmt, count in report.summary_by_format.items():\n            output += f\"- **{fmt.upper()}**: {count} sources\\n\"\n\n        output += \"\\n## Storage Recommendations\\n\\n\"\n        for storage, sources in report.storage_recommendations.items():\n            output += f\"### {storage.replace('_', ' ').title()}\\n\"\n            for src in sources:\n                output += f\"- {src}\\n\"\n            output += \"\\n\"\n\n        output += \"## Integration Strategy\\n\\n\"\n        for category, strategy in report.integration_strategy.items():\n            output += f\"### {category.replace('_', ' ').title()}\\n{strategy}\\n\\n\"\n\n        output += \"## Detailed Classifications\\n\\n\"\n        for c in report.classifications[:10]:\n            output += f\"\"\"\n### {c.source_name}\n- **Format:** {c.detected_format.value}\n- **Structure:** {c.structure.value}\n- **Storage:** {c.storage_recommendation.value}\n- **Tools:** {', '.join(c.processing_tools[:3])}\n- **Confidence:** {c.confidence:.0%}\n\"\"\"\n\n        return output"
      },
      {
        "title": "Classify Single Data Source",
        "body": "classifier = DataTypeClassifier()\n\n# Classify a BIM model\nclassification = classifier.classify_source(\n    source_name=\"Building Model\",\n    source_type=\"file\",\n    file_extension=\".ifc\",\n    metadata={\"volume\": \"large\"}\n)\n\nprint(f\"Format: {classification.detected_format.value}\")\nprint(f\"Structure: {classification.structure.value}\")\nprint(f\"Storage: {classification.storage_recommendation.value}\")\nprint(f\"Tools: {classification.processing_tools}\")"
      },
      {
        "title": "Classify Multiple Sources",
        "body": "sources = [\n    {\"name\": \"Cost Database\", \"type\": \"database\", \"extension\": \".sql\"},\n    {\"name\": \"Building Model\", \"type\": \"file\", \"extension\": \".ifc\"},\n    {\"name\": \"Contract PDFs\", \"type\": \"file\", \"extension\": \".pdf\"},\n    {\"name\": \"Site Photos\", \"type\": \"file\", \"extension\": \".jpg\"},\n    {\"name\": \"Schedule\", \"type\": \"file\", \"extension\": \".mpp\"}\n]\n\nreport = classifier.classify_multiple(sources)\n\nprint(f\"Total: {report.total_sources}\")\nprint(f\"By structure: {report.summary_by_structure}\")"
      },
      {
        "title": "Generate Classification Report",
        "body": "report_text = classifier.generate_report(report)\nprint(report_text)\n\n# Save to file\nwith open(\"classification_report.md\", \"w\") as f:\n    f.write(report_text)"
      },
      {
        "title": "Quick Reference",
        "body": "ComponentPurposeDataTypeClassifierMain classification engineDataStructureStructure types (structured, semi, unstructured)DataFormatFile format detectionStorageRecommendationStorage system recommendationsDataClassificationClassification resultClassificationReportMulti-source report"
      },
      {
        "title": "Resources",
        "body": "Book: \"Data-Driven Construction\" by Artem Boiko, Chapter 2.1\nWebsite: https://datadrivenconstruction.io"
      },
      {
        "title": "Next Steps",
        "body": "Use sql-query-builder for structured data queries\nUse pdf-to-structured for unstructured data\nUse data-model-designer for schema design"
      }
    ],
    "body": "Data Type Classifier\nOverview\n\nBased on DDC methodology (Chapter 2.1), this skill classifies construction data by type, analyzes data sources, and recommends appropriate storage, processing, and integration methods.\n\nBook Reference: \"Типы данных в строительстве\" / \"Data Types in Construction\"\n\nQuick Start\nfrom dataclasses import dataclass, field\nfrom enum import Enum\nfrom typing import List, Dict, Optional, Any, Tuple\nfrom datetime import datetime\nimport json\nimport re\nimport mimetypes\n\nclass DataStructure(Enum):\n    \"\"\"Data structure classification\"\"\"\n    STRUCTURED = \"structured\"           # Tables, databases, spreadsheets\n    SEMI_STRUCTURED = \"semi_structured\" # JSON, XML, IFC\n    UNSTRUCTURED = \"unstructured\"       # Documents, images, videos\n    GEOMETRIC = \"geometric\"             # CAD, BIM geometry\n    TEMPORAL = \"temporal\"               # Time-series, schedules\n    SPATIAL = \"spatial\"                 # GIS, coordinates\n\nclass DataFormat(Enum):\n    \"\"\"Common construction data formats\"\"\"\n    # Structured\n    CSV = \"csv\"\n    EXCEL = \"excel\"\n    SQL = \"sql\"\n    PARQUET = \"parquet\"\n\n    # Semi-structured\n    JSON = \"json\"\n    XML = \"xml\"\n    IFC = \"ifc\"\n    BCF = \"bcf\"\n\n    # Unstructured\n    PDF = \"pdf\"\n    DOCX = \"docx\"\n    IMAGE = \"image\"\n    VIDEO = \"video\"\n\n    # Geometric\n    DWG = \"dwg\"\n    DXF = \"dxf\"\n    RVT = \"rvt\"\n    NWD = \"nwd\"\n    OBJ = \"obj\"\n    STL = \"stl\"\n\n    # Schedule\n    MPP = \"mpp\"\n    P6 = \"p6\"\n    XER = \"xer\"\n\nclass StorageRecommendation(Enum):\n    \"\"\"Storage system recommendations\"\"\"\n    RELATIONAL_DB = \"relational_database\"\n    DOCUMENT_DB = \"document_database\"\n    OBJECT_STORAGE = \"object_storage\"\n    GRAPH_DB = \"graph_database\"\n    TIME_SERIES_DB = \"time_series_database\"\n    VECTOR_DB = \"vector_database\"\n    FILE_SYSTEM = \"file_system\"\n    DATA_LAKE = \"data_lake\"\n\n@dataclass\nclass DataCharacteristics:\n    \"\"\"Characteristics of a data source\"\"\"\n    has_schema: bool\n    has_relationships: bool\n    is_queryable: bool\n    is_binary: bool\n    has_geometry: bool\n    has_temporal: bool\n    has_text_content: bool\n    avg_record_size: Optional[int] = None  # bytes\n    estimated_volume: Optional[str] = None  # small/medium/large/huge\n    update_frequency: Optional[str] = None\n\n@dataclass\nclass DataClassification:\n    \"\"\"Classification result for a data source\"\"\"\n    source_name: str\n    source_type: str\n    detected_format: DataFormat\n    structure: DataStructure\n    characteristics: DataCharacteristics\n    storage_recommendation: StorageRecommendation\n    processing_tools: List[str]\n    integration_options: List[str]\n    quality_considerations: List[str]\n    confidence: float\n\n@dataclass\nclass ClassificationReport:\n    \"\"\"Complete classification report\"\"\"\n    total_sources: int\n    classifications: List[DataClassification]\n    summary_by_structure: Dict[str, int]\n    summary_by_format: Dict[str, int]\n    storage_recommendations: Dict[str, List[str]]\n    integration_strategy: Dict[str, str]\n\n\nclass DataTypeClassifier:\n    \"\"\"\n    Classify construction data by type and recommend processing methods.\n    Based on DDC methodology Chapter 2.1.\n    \"\"\"\n\n    def __init__(self):\n        self.format_signatures = self._define_format_signatures()\n        self.structure_mapping = self._define_structure_mapping()\n        self.storage_mapping = self._define_storage_mapping()\n        self.processing_tools = self._define_processing_tools()\n\n    def _define_format_signatures(self) -> Dict[str, Dict]:\n        \"\"\"Define format detection signatures\"\"\"\n        return {\n            # File extensions\n            \".csv\": {\"format\": DataFormat.CSV, \"structure\": DataStructure.STRUCTURED},\n            \".xlsx\": {\"format\": DataFormat.EXCEL, \"structure\": DataStructure.STRUCTURED},\n            \".xls\": {\"format\": DataFormat.EXCEL, \"structure\": DataStructure.STRUCTURED},\n            \".json\": {\"format\": DataFormat.JSON, \"structure\": DataStructure.SEMI_STRUCTURED},\n            \".xml\": {\"format\": DataFormat.XML, \"structure\": DataStructure.SEMI_STRUCTURED},\n            \".ifc\": {\"format\": DataFormat.IFC, \"structure\": DataStructure.SEMI_STRUCTURED},\n            \".bcf\": {\"format\": DataFormat.BCF, \"structure\": DataStructure.SEMI_STRUCTURED},\n            \".pdf\": {\"format\": DataFormat.PDF, \"structure\": DataStructure.UNSTRUCTURED},\n            \".docx\": {\"format\": DataFormat.DOCX, \"structure\": DataStructure.UNSTRUCTURED},\n            \".dwg\": {\"format\": DataFormat.DWG, \"structure\": DataStructure.GEOMETRIC},\n            \".dxf\": {\"format\": DataFormat.DXF, \"structure\": DataStructure.GEOMETRIC},\n            \".rvt\": {\"format\": DataFormat.RVT, \"structure\": DataStructure.GEOMETRIC},\n            \".nwd\": {\"format\": DataFormat.NWD, \"structure\": DataStructure.GEOMETRIC},\n            \".mpp\": {\"format\": DataFormat.MPP, \"structure\": DataStructure.TEMPORAL},\n            \".xer\": {\"format\": DataFormat.XER, \"structure\": DataStructure.TEMPORAL},\n            \".parquet\": {\"format\": DataFormat.PARQUET, \"structure\": DataStructure.STRUCTURED},\n            \".jpg\": {\"format\": DataFormat.IMAGE, \"structure\": DataStructure.UNSTRUCTURED},\n            \".png\": {\"format\": DataFormat.IMAGE, \"structure\": DataStructure.UNSTRUCTURED},\n            \".mp4\": {\"format\": DataFormat.VIDEO, \"structure\": DataStructure.UNSTRUCTURED}\n        }\n\n    def _define_structure_mapping(self) -> Dict[DataStructure, Dict]:\n        \"\"\"Define characteristics for each structure type\"\"\"\n        return {\n            DataStructure.STRUCTURED: {\n                \"description\": \"Tabular data with fixed schema\",\n                \"examples\": [\"Cost databases\", \"Material lists\", \"Vendor records\"],\n                \"query_support\": True,\n                \"schema_required\": True\n            },\n            DataStructure.SEMI_STRUCTURED: {\n                \"description\": \"Hierarchical data with flexible schema\",\n                \"examples\": [\"BIM models (IFC)\", \"API responses\", \"Configuration files\"],\n                \"query_support\": True,\n                \"schema_required\": False\n            },\n            DataStructure.UNSTRUCTURED: {\n                \"description\": \"No predefined schema or format\",\n                \"examples\": [\"Contracts\", \"Photos\", \"Emails\", \"Meeting notes\"],\n                \"query_support\": False,\n                \"schema_required\": False\n            },\n            DataStructure.GEOMETRIC: {\n                \"description\": \"3D/2D geometric and spatial data\",\n                \"examples\": [\"CAD drawings\", \"BIM geometry\", \"Point clouds\"],\n                \"query_support\": True,\n                \"schema_required\": True\n            },\n            DataStructure.TEMPORAL: {\n                \"description\": \"Time-based sequential data\",\n                \"examples\": [\"Schedules\", \"Progress data\", \"Sensor readings\"],\n                \"query_support\": True,\n                \"schema_required\": True\n            },\n            DataStructure.SPATIAL: {\n                \"description\": \"Geographic and location data\",\n                \"examples\": [\"Site maps\", \"GPS tracks\", \"GIS layers\"],\n                \"query_support\": True,\n                \"schema_required\": True\n            }\n        }\n\n    def _define_storage_mapping(self) -> Dict[DataStructure, StorageRecommendation]:\n        \"\"\"Map data structures to storage recommendations\"\"\"\n        return {\n            DataStructure.STRUCTURED: StorageRecommendation.RELATIONAL_DB,\n            DataStructure.SEMI_STRUCTURED: StorageRecommendation.DOCUMENT_DB,\n            DataStructure.UNSTRUCTURED: StorageRecommendation.OBJECT_STORAGE,\n            DataStructure.GEOMETRIC: StorageRecommendation.FILE_SYSTEM,\n            DataStructure.TEMPORAL: StorageRecommendation.TIME_SERIES_DB,\n            DataStructure.SPATIAL: StorageRecommendation.RELATIONAL_DB\n        }\n\n    def _define_processing_tools(self) -> Dict[DataFormat, List[str]]:\n        \"\"\"Define processing tools for each format\"\"\"\n        return {\n            DataFormat.CSV: [\"pandas\", \"polars\", \"duckdb\"],\n            DataFormat.EXCEL: [\"pandas\", \"openpyxl\", \"xlrd\"],\n            DataFormat.JSON: [\"json\", \"pandas\", \"jq\"],\n            DataFormat.XML: [\"lxml\", \"ElementTree\", \"BeautifulSoup\"],\n            DataFormat.IFC: [\"ifcopenshell\", \"IfcOpenShell\", \"xBIM\"],\n            DataFormat.BCF: [\"bcfpython\", \"ifcopenshell\"],\n            DataFormat.PDF: [\"pdfplumber\", \"PyPDF2\", \"pdf2image\"],\n            DataFormat.DOCX: [\"python-docx\", \"mammoth\"],\n            DataFormat.DWG: [\"ezdxf\", \"Teigha\", \"ODA SDK\"],\n            DataFormat.DXF: [\"ezdxf\", \"dxfgrabber\"],\n            DataFormat.RVT: [\"Revit API\", \"pyRevit\", \"Dynamo\"],\n            DataFormat.NWD: [\"Navisworks API\", \"NW API\"],\n            DataFormat.MPP: [\"mpxj\", \"Project API\"],\n            DataFormat.XER: [\"xerparser\", \"P6 API\"],\n            DataFormat.PARQUET: [\"pandas\", \"pyarrow\", \"polars\"],\n            DataFormat.IMAGE: [\"PIL\", \"opencv\", \"scikit-image\"],\n            DataFormat.VIDEO: [\"opencv\", \"ffmpeg\", \"moviepy\"]\n        }\n\n    def classify_source(\n        self,\n        source_name: str,\n        source_type: str,\n        file_extension: Optional[str] = None,\n        sample_data: Optional[Any] = None,\n        metadata: Optional[Dict] = None\n    ) -> DataClassification:\n        \"\"\"\n        Classify a single data source.\n\n        Args:\n            source_name: Name of the data source\n            source_type: Type (file, database, api, etc.)\n            file_extension: File extension if applicable\n            sample_data: Sample of the data for analysis\n            metadata: Additional metadata\n\n        Returns:\n            Classification result\n        \"\"\"\n        # Detect format\n        detected_format, structure = self._detect_format(\n            file_extension, source_type, sample_data\n        )\n\n        # Analyze characteristics\n        characteristics = self._analyze_characteristics(\n            detected_format, structure, sample_data, metadata\n        )\n\n        # Determine storage recommendation\n        storage = self._recommend_storage(structure, characteristics)\n\n        # Get processing tools\n        tools = self.processing_tools.get(detected_format, [])\n\n        # Determine integration options\n        integration = self._get_integration_options(detected_format, structure)\n\n        # Quality considerations\n        quality = self._get_quality_considerations(detected_format, structure)\n\n        # Calculate confidence\n        confidence = self._calculate_confidence(\n            file_extension, sample_data, metadata\n        )\n\n        return DataClassification(\n            source_name=source_name,\n            source_type=source_type,\n            detected_format=detected_format,\n            structure=structure,\n            characteristics=characteristics,\n            storage_recommendation=storage,\n            processing_tools=tools,\n            integration_options=integration,\n            quality_considerations=quality,\n            confidence=confidence\n        )\n\n    def _detect_format(\n        self,\n        extension: Optional[str],\n        source_type: str,\n        sample: Optional[Any]\n    ) -> Tuple[DataFormat, DataStructure]:\n        \"\"\"Detect data format and structure\"\"\"\n        # Check file extension\n        if extension:\n            ext = extension.lower() if extension.startswith('.') else f\".{extension.lower()}\"\n            if ext in self.format_signatures:\n                sig = self.format_signatures[ext]\n                return sig[\"format\"], sig[\"structure\"]\n\n        # Check source type\n        if source_type == \"database\":\n            return DataFormat.SQL, DataStructure.STRUCTURED\n        elif source_type == \"api\":\n            return DataFormat.JSON, DataStructure.SEMI_STRUCTURED\n\n        # Analyze sample data\n        if sample:\n            if isinstance(sample, dict):\n                return DataFormat.JSON, DataStructure.SEMI_STRUCTURED\n            elif isinstance(sample, list) and all(isinstance(x, dict) for x in sample):\n                return DataFormat.JSON, DataStructure.STRUCTURED\n            elif isinstance(sample, str):\n                if sample.strip().startswith('<'):\n                    return DataFormat.XML, DataStructure.SEMI_STRUCTURED\n                elif sample.strip().startswith('{'):\n                    return DataFormat.JSON, DataStructure.SEMI_STRUCTURED\n\n        # Default\n        return DataFormat.JSON, DataStructure.SEMI_STRUCTURED\n\n    def _analyze_characteristics(\n        self,\n        format: DataFormat,\n        structure: DataStructure,\n        sample: Optional[Any],\n        metadata: Optional[Dict]\n    ) -> DataCharacteristics:\n        \"\"\"Analyze data characteristics\"\"\"\n        return DataCharacteristics(\n            has_schema=structure in [DataStructure.STRUCTURED, DataStructure.TEMPORAL],\n            has_relationships=format in [DataFormat.IFC, DataFormat.SQL],\n            is_queryable=structure != DataStructure.UNSTRUCTURED,\n            is_binary=format in [\n                DataFormat.DWG, DataFormat.RVT, DataFormat.NWD,\n                DataFormat.IMAGE, DataFormat.VIDEO, DataFormat.PDF\n            ],\n            has_geometry=structure == DataStructure.GEOMETRIC or format == DataFormat.IFC,\n            has_temporal=structure == DataStructure.TEMPORAL,\n            has_text_content=format in [\n                DataFormat.PDF, DataFormat.DOCX, DataFormat.CSV\n            ],\n            estimated_volume=metadata.get(\"volume\") if metadata else None,\n            update_frequency=metadata.get(\"update_frequency\") if metadata else None\n        )\n\n    def _recommend_storage(\n        self,\n        structure: DataStructure,\n        characteristics: DataCharacteristics\n    ) -> StorageRecommendation:\n        \"\"\"Recommend storage solution\"\"\"\n        # Special cases\n        if characteristics.has_text_content and not characteristics.has_schema:\n            return StorageRecommendation.VECTOR_DB\n\n        if characteristics.is_binary and characteristics.estimated_volume == \"huge\":\n            return StorageRecommendation.OBJECT_STORAGE\n\n        if characteristics.has_relationships:\n            return StorageRecommendation.GRAPH_DB\n\n        # Default mapping\n        return self.storage_mapping.get(structure, StorageRecommendation.FILE_SYSTEM)\n\n    def _get_integration_options(\n        self,\n        format: DataFormat,\n        structure: DataStructure\n    ) -> List[str]:\n        \"\"\"Get integration options for the data\"\"\"\n        options = []\n\n        if structure == DataStructure.STRUCTURED:\n            options.extend([\"Direct SQL queries\", \"ETL pipelines\", \"API export\"])\n        elif structure == DataStructure.SEMI_STRUCTURED:\n            options.extend([\"JSON/XML parsing\", \"Schema validation\", \"API integration\"])\n        elif structure == DataStructure.UNSTRUCTURED:\n            options.extend([\"OCR extraction\", \"NLP processing\", \"ML classification\"])\n        elif structure == DataStructure.GEOMETRIC:\n            options.extend([\"IFC export\", \"Geometry extraction\", \"Clash detection\"])\n\n        # Format-specific options\n        if format == DataFormat.IFC:\n            options.append(\"IFC import/export via IfcOpenShell\")\n        elif format == DataFormat.EXCEL:\n            options.append(\"Pandas DataFrame conversion\")\n        elif format == DataFormat.PDF:\n            options.append(\"PDF text/table extraction\")\n\n        return options\n\n    def _get_quality_considerations(\n        self,\n        format: DataFormat,\n        structure: DataStructure\n    ) -> List[str]:\n        \"\"\"Get quality considerations\"\"\"\n        considerations = []\n\n        if structure == DataStructure.STRUCTURED:\n            considerations.extend([\n                \"Validate schema consistency\",\n                \"Check for null/missing values\",\n                \"Verify data types\"\n            ])\n        elif structure == DataStructure.UNSTRUCTURED:\n            considerations.extend([\n                \"OCR accuracy verification\",\n                \"Text encoding issues\",\n                \"Content extraction completeness\"\n            ])\n        elif structure == DataStructure.GEOMETRIC:\n            considerations.extend([\n                \"Model validity (closed solids)\",\n                \"Coordinate system consistency\",\n                \"Unit verification\"\n            ])\n\n        # Format-specific\n        if format == DataFormat.IFC:\n            considerations.append(\"IFC schema version compatibility\")\n        elif format == DataFormat.EXCEL:\n            considerations.append(\"Formula vs value extraction\")\n\n        return considerations\n\n    def _calculate_confidence(\n        self,\n        extension: Optional[str],\n        sample: Optional[Any],\n        metadata: Optional[Dict]\n    ) -> float:\n        \"\"\"Calculate classification confidence\"\"\"\n        confidence = 0.5  # Base confidence\n\n        if extension:\n            confidence += 0.3  # Extension provides good hint\n        if sample:\n            confidence += 0.15  # Sample data helps\n        if metadata:\n            confidence += 0.05  # Metadata adds context\n\n        return min(1.0, confidence)\n\n    def classify_multiple(\n        self,\n        sources: List[Dict]\n    ) -> ClassificationReport:\n        \"\"\"\n        Classify multiple data sources.\n\n        Args:\n            sources: List of source definitions\n\n        Returns:\n            Complete classification report\n        \"\"\"\n        classifications = []\n\n        for source in sources:\n            classification = self.classify_source(\n                source_name=source[\"name\"],\n                source_type=source.get(\"type\", \"file\"),\n                file_extension=source.get(\"extension\"),\n                sample_data=source.get(\"sample\"),\n                metadata=source.get(\"metadata\")\n            )\n            classifications.append(classification)\n\n        # Generate summaries\n        summary_structure = {}\n        summary_format = {}\n        storage_recs = {}\n\n        for c in classifications:\n            # Structure summary\n            struct = c.structure.value\n            summary_structure[struct] = summary_structure.get(struct, 0) + 1\n\n            # Format summary\n            fmt = c.detected_format.value\n            summary_format[fmt] = summary_format.get(fmt, 0) + 1\n\n            # Storage recommendations\n            storage = c.storage_recommendation.value\n            if storage not in storage_recs:\n                storage_recs[storage] = []\n            storage_recs[storage].append(c.source_name)\n\n        # Integration strategy\n        strategy = self._generate_integration_strategy(classifications)\n\n        return ClassificationReport(\n            total_sources=len(sources),\n            classifications=classifications,\n            summary_by_structure=summary_structure,\n            summary_by_format=summary_format,\n            storage_recommendations=storage_recs,\n            integration_strategy=strategy\n        )\n\n    def _generate_integration_strategy(\n        self,\n        classifications: List[DataClassification]\n    ) -> Dict[str, str]:\n        \"\"\"Generate integration strategy\"\"\"\n        strategy = {}\n\n        # Group by structure\n        structured = [c for c in classifications if c.structure == DataStructure.STRUCTURED]\n        semi = [c for c in classifications if c.structure == DataStructure.SEMI_STRUCTURED]\n        unstructured = [c for c in classifications if c.structure == DataStructure.UNSTRUCTURED]\n        geometric = [c for c in classifications if c.structure == DataStructure.GEOMETRIC]\n\n        if structured:\n            strategy[\"structured_data\"] = (\n                \"Use ETL pipeline to consolidate into central data warehouse. \"\n                \"Implement SQL-based querying and reporting.\"\n            )\n\n        if semi:\n            strategy[\"semi_structured_data\"] = (\n                \"Use document database for flexible storage. \"\n                \"Implement schema validation at ingestion.\"\n            )\n\n        if unstructured:\n            strategy[\"unstructured_data\"] = (\n                \"Extract text content using OCR/NLP. \"\n                \"Store in vector database for semantic search.\"\n            )\n\n        if geometric:\n            strategy[\"geometric_data\"] = (\n                \"Standardize on IFC format for exchange. \"\n                \"Maintain native formats for editing.\"\n            )\n\n        return strategy\n\n    def generate_report(self, report: ClassificationReport) -> str:\n        \"\"\"Generate classification report\"\"\"\n        output = f\"\"\"\n# Data Classification Report\n\n**Total Sources Analyzed:** {report.total_sources}\n\n## Summary by Structure\n\n\"\"\"\n        for struct, count in report.summary_by_structure.items():\n            output += f\"- **{struct.title()}**: {count} sources\\n\"\n\n        output += \"\\n## Summary by Format\\n\\n\"\n        for fmt, count in report.summary_by_format.items():\n            output += f\"- **{fmt.upper()}**: {count} sources\\n\"\n\n        output += \"\\n## Storage Recommendations\\n\\n\"\n        for storage, sources in report.storage_recommendations.items():\n            output += f\"### {storage.replace('_', ' ').title()}\\n\"\n            for src in sources:\n                output += f\"- {src}\\n\"\n            output += \"\\n\"\n\n        output += \"## Integration Strategy\\n\\n\"\n        for category, strategy in report.integration_strategy.items():\n            output += f\"### {category.replace('_', ' ').title()}\\n{strategy}\\n\\n\"\n\n        output += \"## Detailed Classifications\\n\\n\"\n        for c in report.classifications[:10]:\n            output += f\"\"\"\n### {c.source_name}\n- **Format:** {c.detected_format.value}\n- **Structure:** {c.structure.value}\n- **Storage:** {c.storage_recommendation.value}\n- **Tools:** {', '.join(c.processing_tools[:3])}\n- **Confidence:** {c.confidence:.0%}\n\"\"\"\n\n        return output\n\nCommon Use Cases\nClassify Single Data Source\nclassifier = DataTypeClassifier()\n\n# Classify a BIM model\nclassification = classifier.classify_source(\n    source_name=\"Building Model\",\n    source_type=\"file\",\n    file_extension=\".ifc\",\n    metadata={\"volume\": \"large\"}\n)\n\nprint(f\"Format: {classification.detected_format.value}\")\nprint(f\"Structure: {classification.structure.value}\")\nprint(f\"Storage: {classification.storage_recommendation.value}\")\nprint(f\"Tools: {classification.processing_tools}\")\n\nClassify Multiple Sources\nsources = [\n    {\"name\": \"Cost Database\", \"type\": \"database\", \"extension\": \".sql\"},\n    {\"name\": \"Building Model\", \"type\": \"file\", \"extension\": \".ifc\"},\n    {\"name\": \"Contract PDFs\", \"type\": \"file\", \"extension\": \".pdf\"},\n    {\"name\": \"Site Photos\", \"type\": \"file\", \"extension\": \".jpg\"},\n    {\"name\": \"Schedule\", \"type\": \"file\", \"extension\": \".mpp\"}\n]\n\nreport = classifier.classify_multiple(sources)\n\nprint(f\"Total: {report.total_sources}\")\nprint(f\"By structure: {report.summary_by_structure}\")\n\nGenerate Classification Report\nreport_text = classifier.generate_report(report)\nprint(report_text)\n\n# Save to file\nwith open(\"classification_report.md\", \"w\") as f:\n    f.write(report_text)\n\nQuick Reference\nComponent\tPurpose\nDataTypeClassifier\tMain classification engine\nDataStructure\tStructure types (structured, semi, unstructured)\nDataFormat\tFile format detection\nStorageRecommendation\tStorage system recommendations\nDataClassification\tClassification result\nClassificationReport\tMulti-source report\nResources\nBook: \"Data-Driven Construction\" by Artem Boiko, Chapter 2.1\nWebsite: https://datadrivenconstruction.io\nNext Steps\nUse sql-query-builder for structured data queries\nUse pdf-to-structured for unstructured data\nUse data-model-designer for schema design"
  },
  "trust": {
    "sourceLabel": "tencent",
    "provenanceUrl": "https://clawhub.ai/datadrivenconstruction/data-type-classifier",
    "publisherUrl": "https://clawhub.ai/datadrivenconstruction/data-type-classifier",
    "owner": "datadrivenconstruction",
    "version": "2.1.0",
    "license": null,
    "verificationStatus": "Indexed source record"
  },
  "links": {
    "detailUrl": "https://openagent3.xyz/skills/data-type-classifier",
    "downloadUrl": "https://openagent3.xyz/downloads/data-type-classifier",
    "agentUrl": "https://openagent3.xyz/skills/data-type-classifier/agent",
    "manifestUrl": "https://openagent3.xyz/skills/data-type-classifier/agent.json",
    "briefUrl": "https://openagent3.xyz/skills/data-type-classifier/agent.md"
  }
}