{
  "schemaVersion": "1.0",
  "item": {
    "slug": "historical-cost-analyzer",
    "name": "Historical Cost Analyzer",
    "source": "tencent",
    "type": "skill",
    "category": "数据分析",
    "sourceUrl": "https://clawhub.ai/datadrivenconstruction/historical-cost-analyzer",
    "canonicalUrl": "https://clawhub.ai/datadrivenconstruction/historical-cost-analyzer",
    "targetPlatform": "OpenClaw"
  },
  "install": {
    "downloadMode": "redirect",
    "downloadUrl": "/downloads/historical-cost-analyzer",
    "sourceDownloadUrl": "https://wry-manatee-359.convex.site/api/v1/download?slug=historical-cost-analyzer",
    "sourcePlatform": "tencent",
    "targetPlatform": "OpenClaw",
    "installMethod": "Manual import",
    "extraction": "Extract archive",
    "prerequisites": [
      "OpenClaw"
    ],
    "packageFormat": "ZIP package",
    "includedAssets": [
      "claw.json",
      "instructions.md",
      "SKILL.md"
    ],
    "primaryDoc": "SKILL.md",
    "quickSetup": [
      "Download the package from Yavira.",
      "Extract the archive and review SKILL.md first.",
      "Import or place the package into your OpenClaw setup."
    ],
    "agentAssist": {
      "summary": "Hand the extracted package to your coding agent with a concrete install brief instead of figuring it out manually.",
      "steps": [
        "Download the package from Yavira.",
        "Extract it into a folder your agent can access.",
        "Paste one of the prompts below and point your agent at the extracted folder."
      ],
      "prompts": [
        {
          "label": "New install",
          "body": "I downloaded a skill package from Yavira. Read SKILL.md from the extracted folder and install it by following the included instructions. Tell me what you changed and call out any manual steps you could not complete."
        },
        {
          "label": "Upgrade existing",
          "body": "I downloaded an updated skill package from Yavira. Read SKILL.md from the extracted folder, compare it with my current installation, and upgrade it while preserving any custom configuration unless the package docs explicitly say otherwise. Summarize what changed and any follow-up checks I should run."
        }
      ]
    },
    "sourceHealth": {
      "source": "tencent",
      "status": "healthy",
      "reason": "direct_download_ok",
      "recommendedAction": "download",
      "checkedAt": "2026-04-23T16:43:11.935Z",
      "expiresAt": "2026-04-30T16:43:11.935Z",
      "httpStatus": 200,
      "finalUrl": "https://wry-manatee-359.convex.site/api/v1/download?slug=historical-cost-analyzer",
      "contentType": "application/zip",
      "probeMethod": "head",
      "details": {
        "probeUrl": "https://wry-manatee-359.convex.site/api/v1/download?slug=historical-cost-analyzer",
        "contentDisposition": "attachment; filename=\"historical-cost-analyzer-1.0.1.zip\"",
        "redirectLocation": null,
        "bodySnippet": null
      },
      "scope": "source",
      "summary": "Source download looks usable.",
      "detail": "Yavira can redirect you to the upstream package for this source.",
      "primaryActionLabel": "Download for OpenClaw",
      "primaryActionHref": "/downloads/historical-cost-analyzer"
    },
    "validation": {
      "installChecklist": [
        "Use the Yavira download entry.",
        "Review SKILL.md after the package is downloaded.",
        "Confirm the extracted package contains the expected setup assets."
      ],
      "postInstallChecks": [
        "Confirm the extracted package includes the expected docs or setup files.",
        "Validate the skill or prompts are available in your target agent workspace.",
        "Capture any manual follow-up steps the agent could not complete."
      ]
    },
    "downloadPageUrl": "https://openagent3.xyz/downloads/historical-cost-analyzer",
    "agentPageUrl": "https://openagent3.xyz/skills/historical-cost-analyzer/agent",
    "manifestUrl": "https://openagent3.xyz/skills/historical-cost-analyzer/agent.json",
    "briefUrl": "https://openagent3.xyz/skills/historical-cost-analyzer/agent.md"
  },
  "agentAssist": {
    "summary": "Hand the extracted package to your coding agent with a concrete install brief instead of figuring it out manually.",
    "steps": [
      "Download the package from Yavira.",
      "Extract it into a folder your agent can access.",
      "Paste one of the prompts below and point your agent at the extracted folder."
    ],
    "prompts": [
      {
        "label": "New install",
        "body": "I downloaded a skill package from Yavira. Read SKILL.md from the extracted folder and install it by following the included instructions. Tell me what you changed and call out any manual steps you could not complete."
      },
      {
        "label": "Upgrade existing",
        "body": "I downloaded an updated skill package from Yavira. Read SKILL.md from the extracted folder, compare it with my current installation, and upgrade it while preserving any custom configuration unless the package docs explicitly say otherwise. Summarize what changed and any follow-up checks I should run."
      }
    ]
  },
  "documentation": {
    "source": "clawhub",
    "primaryDoc": "SKILL.md",
    "sections": [
      {
        "title": "Overview",
        "body": "Analyze historical construction cost data for benchmarking, escalation tracking, and estimating calibration. Compare similar projects, identify cost drivers, and improve future estimates."
      },
      {
        "title": "Business Case",
        "body": "Historical cost analysis enables:\n\nBenchmarking: Compare current estimates to past projects\nCalibration: Improve estimating accuracy using actual data\nTrends: Track cost escalation and market changes\nRisk Assessment: Identify cost drivers and overrun patterns"
      },
      {
        "title": "Technical Implementation",
        "body": "from dataclasses import dataclass, field\nfrom typing import List, Dict, Any, Optional, Tuple\nimport pandas as pd\nimport numpy as np\nfrom datetime import datetime\nfrom scipy import stats\n\n@dataclass\nclass CostBenchmark:\n    metric_name: str\n    value: float\n    unit: str\n    percentile_25: float\n    percentile_50: float\n    percentile_75: float\n    sample_size: int\n    project_types: List[str]\n\n@dataclass\nclass EscalationAnalysis:\n    from_year: int\n    to_year: int\n    annual_rate: float\n    total_change: float\n    category: str\n    confidence: float\n\n@dataclass\nclass CostDriver:\n    factor: str\n    impact_percentage: float\n    correlation: float\n    description: str\n\nclass HistoricalCostAnalyzer:\n    \"\"\"Analyze historical construction costs.\"\"\"\n\n    # RSMeans City Cost Indexes (sample - would be loaded from database)\n    LOCATION_FACTORS = {\n        'New York': 1.32, 'San Francisco': 1.28, 'Los Angeles': 1.15,\n        'Chicago': 1.12, 'Houston': 0.92, 'Dallas': 0.89,\n        'Phoenix': 0.93, 'Atlanta': 0.91, 'Denver': 1.02,\n        'Seattle': 1.08, 'National Average': 1.00\n    }\n\n    # Historical cost indices by year\n    COST_INDICES = {\n        2015: 100.0, 2016: 102.1, 2017: 105.3, 2018: 109.2,\n        2019: 112.5, 2020: 114.8, 2021: 121.4, 2022: 135.6,\n        2023: 142.3, 2024: 148.7, 2025: 154.2, 2026: 160.0\n    }\n\n    def __init__(self, historical_data: pd.DataFrame = None):\n        self.data = historical_data\n        self.benchmarks: Dict[str, CostBenchmark] = {}\n\n    def load_data(self, data: pd.DataFrame):\n        \"\"\"Load historical project data.\"\"\"\n        self.data = data.copy()\n\n        # Normalize data\n        if 'completion_year' not in self.data.columns and 'completion_date' in self.data.columns:\n            self.data['completion_year'] = pd.to_datetime(self.data['completion_date']).dt.year\n\n        # Calculate key metrics\n        if 'gross_area' in 
self.data.columns and 'final_cost' in self.data.columns:\n            self.data['cost_per_sf'] = self.data['final_cost'] / self.data['gross_area']\n\n        if 'original_estimate' in self.data.columns and 'final_cost' in self.data.columns:\n            self.data['overrun_pct'] = ((self.data['final_cost'] - self.data['original_estimate'])\n                                         / self.data['original_estimate'] * 100)\n\n    def normalize_to_year(self, costs: pd.Series, from_years: pd.Series,\n                          to_year: int = 2026) -> pd.Series:\n        \"\"\"Normalize costs to a common year using cost indices.\"\"\"\n        normalized = costs.copy()\n\n        for i, (cost, year) in enumerate(zip(costs, from_years)):\n            if pd.notna(cost) and pd.notna(year):\n                year = int(year)\n                if year in self.COST_INDICES and to_year in self.COST_INDICES:\n                    factor = self.COST_INDICES[to_year] / self.COST_INDICES[year]\n                    normalized.iloc[i] = cost * factor\n\n        return normalized\n\n    def normalize_to_location(self, costs: pd.Series, locations: pd.Series,\n                               to_location: str = 'National Average') -> pd.Series:\n        \"\"\"Normalize costs to a common location.\"\"\"\n        normalized = costs.copy()\n        to_factor = self.LOCATION_FACTORS.get(to_location, 1.0)\n\n        for i, (cost, loc) in enumerate(zip(costs, locations)):\n            if pd.notna(cost) and loc in self.LOCATION_FACTORS:\n                from_factor = self.LOCATION_FACTORS[loc]\n                normalized.iloc[i] = cost * (to_factor / from_factor)\n\n        return normalized\n\n    def calculate_benchmarks(self, project_type: str = None,\n                              year_range: Tuple[int, int] = None) -> Dict[str, CostBenchmark]:\n        \"\"\"Calculate cost benchmarks from historical data.\"\"\"\n        df = self.data.copy()\n\n        # Filter by project type\n        if 
project_type and 'project_type' in df.columns:\n            df = df[df['project_type'] == project_type]\n\n        # Filter by year range\n        if year_range and 'completion_year' in df.columns:\n            df = df[(df['completion_year'] >= year_range[0]) &\n                    (df['completion_year'] <= year_range[1])]\n\n        benchmarks = {}\n\n        # Cost per SF\n        if 'cost_per_sf' in df.columns:\n            values = df['cost_per_sf'].dropna()\n            if len(values) > 0:\n                benchmarks['cost_per_sf'] = CostBenchmark(\n                    metric_name='Cost per SF',\n                    value=values.median(),\n                    unit='$/SF',\n                    percentile_25=values.quantile(0.25),\n                    percentile_50=values.quantile(0.50),\n                    percentile_75=values.quantile(0.75),\n                    sample_size=len(values),\n                    project_types=[project_type] if project_type else df['project_type'].unique().tolist()\n                )\n\n        # Overrun percentage\n        if 'overrun_pct' in df.columns:\n            values = df['overrun_pct'].dropna()\n            if len(values) > 0:\n                benchmarks['overrun_pct'] = CostBenchmark(\n                    metric_name='Cost Overrun',\n                    value=values.median(),\n                    unit='%',\n                    percentile_25=values.quantile(0.25),\n                    percentile_50=values.quantile(0.50),\n                    percentile_75=values.quantile(0.75),\n                    sample_size=len(values),\n                    project_types=[project_type] if project_type else df['project_type'].unique().tolist()\n                )\n\n        self.benchmarks.update(benchmarks)\n        return benchmarks\n\n    def calculate_escalation(self, category: str = 'overall',\n                              from_year: int = 2020,\n                              to_year: int = 2026) -> EscalationAnalysis:\n        
\"\"\"Calculate cost escalation between years.\"\"\"\n        if from_year in self.COST_INDICES and to_year in self.COST_INDICES:\n            from_index = self.COST_INDICES[from_year]\n            to_index = self.COST_INDICES[to_year]\n\n            total_change = (to_index - from_index) / from_index\n            years = to_year - from_year\n            annual_rate = (to_index / from_index) ** (1 / years) - 1 if years > 0 else 0\n\n            return EscalationAnalysis(\n                from_year=from_year,\n                to_year=to_year,\n                annual_rate=annual_rate,\n                total_change=total_change,\n                category=category,\n                confidence=0.95\n            )\n\n        return None\n\n    def identify_cost_drivers(self, target_col: str = 'cost_per_sf') -> List[CostDriver]:\n        \"\"\"Identify factors that drive costs.\"\"\"\n        if self.data is None or target_col not in self.data.columns:\n            return []\n\n        drivers = []\n        target = self.data[target_col].dropna()\n\n        # Analyze numeric columns\n        numeric_cols = self.data.select_dtypes(include=[np.number]).columns\n        exclude = [target_col, 'final_cost', 'original_estimate']\n\n        for col in numeric_cols:\n            if col not in exclude:\n                valid_mask = self.data[col].notna() & self.data[target_col].notna()\n                if valid_mask.sum() > 10:\n                    corr, p_value = stats.pearsonr(\n                        self.data.loc[valid_mask, col],\n                        self.data.loc[valid_mask, target_col]\n                    )\n\n                    if abs(corr) > 0.3 and p_value < 0.05:\n                        impact = corr * self.data[col].std() / target.std() * 100\n\n                        drivers.append(CostDriver(\n                            factor=col,\n                            impact_percentage=abs(impact),\n                            correlation=corr,\n                   
         description=f\"{'Positive' if corr > 0 else 'Negative'} correlation with {target_col}\"\n                        ))\n\n        # Analyze categorical columns\n        categorical_cols = self.data.select_dtypes(include=['object', 'category']).columns\n\n        for col in categorical_cols:\n            if col not in ['project_id', 'project_name']:\n                groups = self.data.groupby(col)[target_col].mean()\n                if len(groups) > 1:\n                    variance = groups.var()\n                    overall_var = target.var()\n\n                    if variance / overall_var > 0.1:\n                        drivers.append(CostDriver(\n                            factor=col,\n                            impact_percentage=variance / overall_var * 100,\n                            correlation=0,\n                            description=f\"Categorical factor with significant cost variation\"\n                        ))\n\n        return sorted(drivers, key=lambda x: -x.impact_percentage)\n\n    def compare_to_benchmark(self, estimate: Dict, project_type: str = None) -> Dict:\n        \"\"\"Compare an estimate to historical benchmarks.\"\"\"\n        if project_type:\n            self.calculate_benchmarks(project_type)\n\n        comparison = {}\n\n        # Cost per SF comparison\n        if 'cost_per_sf' in estimate and 'cost_per_sf' in self.benchmarks:\n            benchmark = self.benchmarks['cost_per_sf']\n            value = estimate['cost_per_sf']\n\n            percentile = stats.percentileofscore(\n                self.data['cost_per_sf'].dropna(), value\n            )\n\n            comparison['cost_per_sf'] = {\n                'estimate': value,\n                'benchmark_median': benchmark.value,\n                'benchmark_range': (benchmark.percentile_25, benchmark.percentile_75),\n                'percentile': percentile,\n                'status': 'within_range' if benchmark.percentile_25 <= value <= benchmark.percentile_75 else 
'outside_range'\n            }\n\n        return comparison\n\n    def find_similar_projects(self, criteria: Dict, n: int = 10) -> pd.DataFrame:\n        \"\"\"Find similar historical projects.\"\"\"\n        df = self.data.copy()\n\n        # Filter by criteria\n        if 'project_type' in criteria:\n            df = df[df['project_type'] == criteria['project_type']]\n\n        if 'gross_area' in criteria:\n            target = criteria['gross_area']\n            tolerance = criteria.get('area_tolerance', 0.3)\n            df = df[(df['gross_area'] >= target * (1 - tolerance)) &\n                    (df['gross_area'] <= target * (1 + tolerance))]\n\n        if 'location' in criteria and 'location' in df.columns:\n            df = df[df['location'] == criteria['location']]\n\n        if 'year_range' in criteria:\n            df = df[(df['completion_year'] >= criteria['year_range'][0]) &\n                    (df['completion_year'] <= criteria['year_range'][1])]\n\n        # Sort by similarity (simple: by area difference)\n        if 'gross_area' in criteria and 'gross_area' in df.columns:\n            df['similarity'] = 1 - abs(df['gross_area'] - criteria['gross_area']) / criteria['gross_area']\n            df = df.sort_values('similarity', ascending=False)\n\n        return df.head(n)\n\n    def analyze_overrun_patterns(self) -> Dict:\n        \"\"\"Analyze patterns in cost overruns.\"\"\"\n        if 'overrun_pct' not in self.data.columns:\n            return {}\n\n        analysis = {}\n\n        # Overall statistics\n        overruns = self.data['overrun_pct'].dropna()\n        analysis['overall'] = {\n            'mean': overruns.mean(),\n            'median': overruns.median(),\n            'std': overruns.std(),\n            'projects_over_budget': (overruns > 0).sum(),\n            'projects_under_budget': (overruns < 0).sum(),\n            'pct_over_budget': (overruns > 0).mean() * 100\n        }\n\n        # By project type\n        if 'project_type' in 
self.data.columns:\n            by_type = self.data.groupby('project_type')['overrun_pct'].agg(['mean', 'std', 'count'])\n            analysis['by_type'] = by_type.to_dict('index')\n\n        # By size category\n        if 'gross_area' in self.data.columns:\n            self.data['size_category'] = pd.cut(\n                self.data['gross_area'],\n                bins=[0, 10000, 50000, 100000, np.inf],\n                labels=['Small (<10k SF)', 'Medium (10-50k SF)', 'Large (50-100k SF)', 'Very Large (>100k SF)']\n            )\n            by_size = self.data.groupby('size_category')['overrun_pct'].agg(['mean', 'std', 'count'])\n            analysis['by_size'] = by_size.to_dict('index')\n\n        return analysis\n\n    def generate_report(self, project_type: str = None) -> str:\n        \"\"\"Generate comprehensive cost analysis report.\"\"\"\n        lines = [\"# Historical Cost Analysis Report\", \"\"]\n        lines.append(f\"**Generated:** {datetime.now().strftime('%Y-%m-%d')}\")\n        lines.append(f\"**Projects Analyzed:** {len(self.data):,}\")\n        if project_type:\n            lines.append(f\"**Project Type:** {project_type}\")\n        lines.append(\"\")\n\n        # Benchmarks\n        benchmarks = self.calculate_benchmarks(project_type)\n        if benchmarks:\n            lines.append(\"## Cost Benchmarks\")\n            for name, bm in benchmarks.items():\n                lines.append(f\"\\n### {bm.metric_name}\")\n                lines.append(f\"- **Median:** {bm.value:.2f} {bm.unit}\")\n                lines.append(f\"- **25th Percentile:** {bm.percentile_25:.2f} {bm.unit}\")\n                lines.append(f\"- **75th Percentile:** {bm.percentile_75:.2f} {bm.unit}\")\n                lines.append(f\"- **Sample Size:** {bm.sample_size}\")\n\n        # Escalation\n        lines.append(\"\\n## Cost Escalation\")\n        esc = self.calculate_escalation(from_year=2020, to_year=2026)\n        if esc:\n            lines.append(f\"- **Period:** 
{esc.from_year} to {esc.to_year}\")\n            lines.append(f\"- **Annual Rate:** {esc.annual_rate:.1%}\")\n            lines.append(f\"- **Total Change:** {esc.total_change:.1%}\")\n\n        # Cost Drivers\n        drivers = self.identify_cost_drivers()\n        if drivers:\n            lines.append(\"\\n## Key Cost Drivers\")\n            for driver in drivers[:5]:\n                lines.append(f\"- **{driver.factor}:** {driver.impact_percentage:.1f}% impact (r={driver.correlation:.2f})\")\n\n        # Overrun Analysis\n        overrun_analysis = self.analyze_overrun_patterns()\n        if 'overall' in overrun_analysis:\n            lines.append(\"\\n## Overrun Analysis\")\n            overall = overrun_analysis['overall']\n            lines.append(f\"- **Average Overrun:** {overall['mean']:.1f}%\")\n            lines.append(f\"- **Projects Over Budget:** {overall['pct_over_budget']:.1f}%\")\n\n        return \"\\n\".join(lines)"
      },
      {
        "title": "Quick Start",
        "body": "import pandas as pd\n\n# Load historical data\nhistorical = pd.read_excel(\"historical_projects.xlsx\")\n\n# Initialize analyzer\nanalyzer = HistoricalCostAnalyzer()\nanalyzer.load_data(historical)\n\n# Calculate benchmarks for office buildings\nbenchmarks = analyzer.calculate_benchmarks(project_type='Office')\nprint(f\"Office median cost: ${benchmarks['cost_per_sf'].value:.2f}/SF\")\n\n# Calculate escalation\nescalation = analyzer.calculate_escalation(from_year=2020, to_year=2026)\nprint(f\"Annual escalation: {escalation.annual_rate:.1%}\")\n\n# Find similar projects\nsimilar = analyzer.find_similar_projects({\n    'project_type': 'Office',\n    'gross_area': 50000,\n    'year_range': (2020, 2025)\n})\nprint(f\"Found {len(similar)} similar projects\")\n\n# Compare estimate to benchmark\ncomparison = analyzer.compare_to_benchmark({'cost_per_sf': 250}, 'Office')\nprint(f\"Estimate percentile: {comparison['cost_per_sf']['percentile']:.0f}th\")\n\n# Generate report\nreport = analyzer.generate_report('Office')\nprint(report)"
      },
      {
        "title": "Dependencies",
        "body": "pip install pandas numpy scipy"
      }
    ],
    "body": "Historical Cost Analyzer for Construction\nOverview\n\nAnalyze historical construction cost data for benchmarking, escalation tracking, and estimating calibration. Compare similar projects, identify cost drivers, and improve future estimates.\n\nBusiness Case\n\nHistorical cost analysis enables:\n\nBenchmarking: Compare current estimates to past projects\nCalibration: Improve estimating accuracy using actual data\nTrends: Track cost escalation and market changes\nRisk Assessment: Identify cost drivers and overrun patterns\nTechnical Implementation\nfrom dataclasses import dataclass, field\nfrom typing import List, Dict, Any, Optional, Tuple\nimport pandas as pd\nimport numpy as np\nfrom datetime import datetime\nfrom scipy import stats\n\n@dataclass\nclass CostBenchmark:\n    metric_name: str\n    value: float\n    unit: str\n    percentile_25: float\n    percentile_50: float\n    percentile_75: float\n    sample_size: int\n    project_types: List[str]\n\n@dataclass\nclass EscalationAnalysis:\n    from_year: int\n    to_year: int\n    annual_rate: float\n    total_change: float\n    category: str\n    confidence: float\n\n@dataclass\nclass CostDriver:\n    factor: str\n    impact_percentage: float\n    correlation: float\n    description: str\n\nclass HistoricalCostAnalyzer:\n    \"\"\"Analyze historical construction costs.\"\"\"\n\n    # RSMeans City Cost Indexes (sample - would be loaded from database)\n    LOCATION_FACTORS = {\n        'New York': 1.32, 'San Francisco': 1.28, 'Los Angeles': 1.15,\n        'Chicago': 1.12, 'Houston': 0.92, 'Dallas': 0.89,\n        'Phoenix': 0.93, 'Atlanta': 0.91, 'Denver': 1.02,\n        'Seattle': 1.08, 'National Average': 1.00\n    }\n\n    # Historical cost indices by year\n    COST_INDICES = {\n        2015: 100.0, 2016: 102.1, 2017: 105.3, 2018: 109.2,\n        2019: 112.5, 2020: 114.8, 2021: 121.4, 2022: 135.6,\n        2023: 142.3, 2024: 148.7, 2025: 154.2, 2026: 160.0\n    }\n\n    def __init__(self, 
historical_data: pd.DataFrame = None):\n        self.data = historical_data\n        self.benchmarks: Dict[str, CostBenchmark] = {}\n\n    def load_data(self, data: pd.DataFrame):\n        \"\"\"Load historical project data.\"\"\"\n        self.data = data.copy()\n\n        # Normalize data\n        if 'completion_year' not in self.data.columns and 'completion_date' in self.data.columns:\n            self.data['completion_year'] = pd.to_datetime(self.data['completion_date']).dt.year\n\n        # Calculate key metrics\n        if 'gross_area' in self.data.columns and 'final_cost' in self.data.columns:\n            self.data['cost_per_sf'] = self.data['final_cost'] / self.data['gross_area']\n\n        if 'original_estimate' in self.data.columns and 'final_cost' in self.data.columns:\n            self.data['overrun_pct'] = ((self.data['final_cost'] - self.data['original_estimate'])\n                                         / self.data['original_estimate'] * 100)\n\n    def normalize_to_year(self, costs: pd.Series, from_years: pd.Series,\n                          to_year: int = 2026) -> pd.Series:\n        \"\"\"Normalize costs to a common year using cost indices.\"\"\"\n        normalized = costs.copy()\n\n        for i, (cost, year) in enumerate(zip(costs, from_years)):\n            if pd.notna(cost) and pd.notna(year):\n                year = int(year)\n                if year in self.COST_INDICES and to_year in self.COST_INDICES:\n                    factor = self.COST_INDICES[to_year] / self.COST_INDICES[year]\n                    normalized.iloc[i] = cost * factor\n\n        return normalized\n\n    def normalize_to_location(self, costs: pd.Series, locations: pd.Series,\n                               to_location: str = 'National Average') -> pd.Series:\n        \"\"\"Normalize costs to a common location.\"\"\"\n        normalized = costs.copy()\n        to_factor = self.LOCATION_FACTORS.get(to_location, 1.0)\n\n        for i, (cost, loc) in enumerate(zip(costs, 
locations)):\n            if pd.notna(cost) and loc in self.LOCATION_FACTORS:\n                from_factor = self.LOCATION_FACTORS[loc]\n                normalized.iloc[i] = cost * (to_factor / from_factor)\n\n        return normalized\n\n    def calculate_benchmarks(self, project_type: str = None,\n                              year_range: Tuple[int, int] = None) -> Dict[str, CostBenchmark]:\n        \"\"\"Calculate cost benchmarks from historical data.\"\"\"\n        df = self.data.copy()\n\n        # Filter by project type\n        if project_type and 'project_type' in df.columns:\n            df = df[df['project_type'] == project_type]\n\n        # Filter by year range\n        if year_range and 'completion_year' in df.columns:\n            df = df[(df['completion_year'] >= year_range[0]) &\n                    (df['completion_year'] <= year_range[1])]\n\n        benchmarks = {}\n\n        # Cost per SF\n        if 'cost_per_sf' in df.columns:\n            values = df['cost_per_sf'].dropna()\n            if len(values) > 0:\n                benchmarks['cost_per_sf'] = CostBenchmark(\n                    metric_name='Cost per SF',\n                    value=values.median(),\n                    unit='$/SF',\n                    percentile_25=values.quantile(0.25),\n                    percentile_50=values.quantile(0.50),\n                    percentile_75=values.quantile(0.75),\n                    sample_size=len(values),\n                    project_types=[project_type] if project_type else df['project_type'].unique().tolist()\n                )\n\n        # Overrun percentage\n        if 'overrun_pct' in df.columns:\n            values = df['overrun_pct'].dropna()\n            if len(values) > 0:\n                benchmarks['overrun_pct'] = CostBenchmark(\n                    metric_name='Cost Overrun',\n                    value=values.median(),\n                    unit='%',\n                    percentile_25=values.quantile(0.25),\n                    
percentile_50=values.quantile(0.50),\n                    percentile_75=values.quantile(0.75),\n                    sample_size=len(values),\n                    project_types=[project_type] if project_type else df['project_type'].unique().tolist()\n                )\n\n        self.benchmarks.update(benchmarks)\n        return benchmarks\n\n    def calculate_escalation(self, category: str = 'overall',\n                              from_year: int = 2020,\n                              to_year: int = 2026) -> EscalationAnalysis:\n        \"\"\"Calculate cost escalation between years.\"\"\"\n        if from_year in self.COST_INDICES and to_year in self.COST_INDICES:\n            from_index = self.COST_INDICES[from_year]\n            to_index = self.COST_INDICES[to_year]\n\n            total_change = (to_index - from_index) / from_index\n            years = to_year - from_year\n            annual_rate = (to_index / from_index) ** (1 / years) - 1 if years > 0 else 0\n\n            return EscalationAnalysis(\n                from_year=from_year,\n                to_year=to_year,\n                annual_rate=annual_rate,\n                total_change=total_change,\n                category=category,\n                confidence=0.95\n            )\n\n        return None\n\n    def identify_cost_drivers(self, target_col: str = 'cost_per_sf') -> List[CostDriver]:\n        \"\"\"Identify factors that drive costs.\"\"\"\n        if self.data is None or target_col not in self.data.columns:\n            return []\n\n        drivers = []\n        target = self.data[target_col].dropna()\n\n        # Analyze numeric columns\n        numeric_cols = self.data.select_dtypes(include=[np.number]).columns\n        exclude = [target_col, 'final_cost', 'original_estimate']\n\n        for col in numeric_cols:\n            if col not in exclude:\n                valid_mask = self.data[col].notna() & self.data[target_col].notna()\n                if valid_mask.sum() > 10:\n                  
  corr, p_value = stats.pearsonr(\n                        self.data.loc[valid_mask, col],\n                        self.data.loc[valid_mask, target_col]\n                    )\n\n                    if abs(corr) > 0.3 and p_value < 0.05:\n                        impact = corr * self.data[col].std() / target.std() * 100\n\n                        drivers.append(CostDriver(\n                            factor=col,\n                            impact_percentage=abs(impact),\n                            correlation=corr,\n                            description=f\"{'Positive' if corr > 0 else 'Negative'} correlation with {target_col}\"\n                        ))\n\n        # Analyze categorical columns\n        categorical_cols = self.data.select_dtypes(include=['object', 'category']).columns\n\n        for col in categorical_cols:\n            if col not in ['project_id', 'project_name']:\n                groups = self.data.groupby(col)[target_col].mean()\n                if len(groups) > 1:\n                    variance = groups.var()\n                    overall_var = target.var()\n\n                    if variance / overall_var > 0.1:\n                        drivers.append(CostDriver(\n                            factor=col,\n                            impact_percentage=variance / overall_var * 100,\n                            correlation=0,\n                            description=f\"Categorical factor with significant cost variation\"\n                        ))\n\n        return sorted(drivers, key=lambda x: -x.impact_percentage)\n\n    def compare_to_benchmark(self, estimate: Dict, project_type: str = None) -> Dict:\n        \"\"\"Compare an estimate to historical benchmarks.\"\"\"\n        if project_type:\n            self.calculate_benchmarks(project_type)\n\n        comparison = {}\n\n        # Cost per SF comparison\n        if 'cost_per_sf' in estimate and 'cost_per_sf' in self.benchmarks:\n            benchmark = self.benchmarks['cost_per_sf']\n            value = estimate['cost_per_sf']\n\n            percentile = stats.percentileofscore(\n                self.data['cost_per_sf'].dropna(), value\n            )\n\n            comparison['cost_per_sf'] = {\n                'estimate': value,\n                'benchmark_median': benchmark.value,\n                'benchmark_range': (benchmark.percentile_25, benchmark.percentile_75),\n                'percentile': percentile,\n                'status': 'within_range' if benchmark.percentile_25 <= value <= benchmark.percentile_75 else 'outside_range'\n            }\n\n        return comparison\n\n    def find_similar_projects(self, criteria: Dict, n: int = 10) -> pd.DataFrame:\n        \"\"\"Find similar historical projects.\"\"\"\n        df = self.data.copy()\n\n        # Filter by criteria\n        if 'project_type' in criteria:\n            df = df[df['project_type'] == criteria['project_type']]\n\n        if 'gross_area' in criteria:\n            target = criteria['gross_area']\n            tolerance = criteria.get('area_tolerance', 0.3)\n            df = df[(df['gross_area'] >= target * (1 - tolerance)) &\n                    (df['gross_area'] <= target * (1 + tolerance))]\n\n        if 'location' in criteria and 'location' in df.columns:\n            df = df[df['location'] == criteria['location']]\n\n        if 'year_range' in criteria:\n            df = df[(df['completion_year'] >= criteria['year_range'][0]) &\n                    (df['completion_year'] <= criteria['year_range'][1])]\n\n        # Sort by similarity (simple: by area difference)\n        if 'gross_area' in criteria and 'gross_area' in df.columns:\n            df['similarity'] = 1 - abs(df['gross_area'] - criteria['gross_area']) / criteria['gross_area']\n            df = df.sort_values('similarity', ascending=False)\n\n        return df.head(n)\n\n    def analyze_overrun_patterns(self) -> Dict:\n        \"\"\"Analyze patterns in cost overruns.\"\"\"\n        if 'overrun_pct' not in self.data.columns:\n            return {}\n\n        analysis = {}\n\n        # Overall statistics\n        overruns = self.data['overrun_pct'].dropna()\n        analysis['overall'] = {\n            'mean': overruns.mean(),\n            'median': overruns.median(),\n            'std': overruns.std(),\n            'projects_over_budget': (overruns > 0).sum(),\n            'projects_under_budget': (overruns < 0).sum(),\n            'pct_over_budget': (overruns > 0).mean() * 100\n        }\n\n        # By project type\n        if 'project_type' in self.data.columns:\n            by_type = self.data.groupby('project_type')['overrun_pct'].agg(['mean', 'std', 'count'])\n            analysis['by_type'] = by_type.to_dict('index')\n\n        # By size category\n        if 'gross_area' in self.data.columns:\n            self.data['size_category'] = pd.cut(\n                self.data['gross_area'],\n                bins=[0, 10000, 50000, 100000, np.inf],\n                labels=['Small (<10k SF)', 'Medium (10-50k SF)', 'Large (50-100k SF)', 'Very Large (>100k SF)']\n            )\n            by_size = self.data.groupby('size_category')['overrun_pct'].agg(['mean', 'std', 'count'])\n            analysis['by_size'] = by_size.to_dict('index')\n\n        return analysis\n\n    def generate_report(self, project_type: str = None) -> str:\n        \"\"\"Generate comprehensive cost analysis report.\"\"\"\n        lines = [\"# Historical Cost Analysis Report\", \"\"]\n        lines.append(f\"**Generated:** {datetime.now().strftime('%Y-%m-%d')}\")\n        lines.append(f\"**Projects Analyzed:** {len(self.data):,}\")\n        if project_type:\n            lines.append(f\"**Project Type:** {project_type}\")\n        lines.append(\"\")\n\n        # Benchmarks\n        benchmarks = self.calculate_benchmarks(project_type)\n        if benchmarks:\n            lines.append(\"## Cost Benchmarks\")\n            for name, bm in benchmarks.items():\n                lines.append(f\"\\n### {bm.metric_name}\")\n                lines.append(f\"- **Median:** {bm.value:.2f} {bm.unit}\")\n                lines.append(f\"- **25th Percentile:** {bm.percentile_25:.2f} {bm.unit}\")\n                lines.append(f\"- **75th Percentile:** {bm.percentile_75:.2f} {bm.unit}\")\n                lines.append(f\"- **Sample Size:** {bm.sample_size}\")\n\n        # Escalation\n        lines.append(\"\\n## Cost Escalation\")\n        esc = self.calculate_escalation(from_year=2020, to_year=2026)\n        if esc:\n            lines.append(f\"- **Period:** {esc.from_year} to {esc.to_year}\")\n            lines.append(f\"- **Annual Rate:** {esc.annual_rate:.1%}\")\n            lines.append(f\"- **Total Change:** {esc.total_change:.1%}\")\n\n        # Cost Drivers\n        drivers = self.identify_cost_drivers()\n        if drivers:\n            lines.append(\"\\n## Key Cost Drivers\")\n            for driver in drivers[:5]:\n                lines.append(f\"- **{driver.factor}:** {driver.impact_percentage:.1f}% impact (r={driver.correlation:.2f})\")\n\n        # Overrun Analysis\n        overrun_analysis = self.analyze_overrun_patterns()\n        if 'overall' in overrun_analysis:\n            lines.append(\"\\n## Overrun Analysis\")\n            overall = overrun_analysis['overall']\n            lines.append(f\"- **Average Overrun:** {overall['mean']:.1f}%\")\n            lines.append(f\"- **Projects Over Budget:** {overall['pct_over_budget']:.1f}%\")\n\n        return \"\\n\".join(lines)\n\nQuick Start\nimport pandas as pd\n\n# Load historical data\nhistorical = pd.read_excel(\"historical_projects.xlsx\")\n\n# Initialize analyzer\nanalyzer = HistoricalCostAnalyzer()\nanalyzer.load_data(historical)\n\n# Calculate benchmarks for office buildings\nbenchmarks = analyzer.calculate_benchmarks(project_type='Office')\nprint(f\"Office median cost: ${benchmarks['cost_per_sf'].value:.2f}/SF\")\n\n# Calculate escalation\nescalation = analyzer.calculate_escalation(from_year=2020, to_year=2026)\nprint(f\"Annual escalation: {escalation.annual_rate:.1%}\")\n\n# Find similar projects\nsimilar = analyzer.find_similar_projects({\n    'project_type': 'Office',\n    'gross_area': 50000,\n    'year_range': (2020, 2025)\n})\nprint(f\"Found {len(similar)} similar projects\")\n\n# Compare estimate to benchmark\ncomparison = analyzer.compare_to_benchmark({'cost_per_sf': 250}, 'Office')\nprint(f\"Estimate percentile: {comparison['cost_per_sf']['percentile']:.0f}th\")\n\n# Generate report\nreport = analyzer.generate_report('Office')\nprint(report)\n\nDependencies\npip install pandas numpy scipy"
  },
  "trust": {
    "sourceLabel": "tencent",
    "provenanceUrl": "https://clawhub.ai/datadrivenconstruction/historical-cost-analyzer",
    "publisherUrl": "https://clawhub.ai/datadrivenconstruction/historical-cost-analyzer",
    "owner": "datadrivenconstruction",
    "version": "2.0.0",
    "license": null,
    "verificationStatus": "Indexed source record"
  },
  "links": {
    "detailUrl": "https://openagent3.xyz/skills/historical-cost-analyzer",
    "downloadUrl": "https://openagent3.xyz/downloads/historical-cost-analyzer",
    "agentUrl": "https://openagent3.xyz/skills/historical-cost-analyzer/agent",
    "manifestUrl": "https://openagent3.xyz/skills/historical-cost-analyzer/agent.json",
    "briefUrl": "https://openagent3.xyz/skills/historical-cost-analyzer/agent.md"
  }
}