{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# IESO Coincident Peak Prediction — Feature Engineering\n",
    "\n",
    "This notebook transforms raw demand and weather data into ML-ready features.\n",
    "Feature categories:\n",
    "- **Weather features:** humidex, cooling degree hours, rolling averages\n",
    "- **Temporal features:** calendar indicators, Ontario holidays\n",
    "- **Demand momentum:** lagged demand, rolling windows\n",
    "- **Peak context:** running threshold tracker, peaks accumulated"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "import numpy as np\n",
    "import matplotlib.pyplot as plt\n",
    "import seaborn as sns\n",
    "import warnings\n",
    "from pathlib import Path\n",
    "from sklearn.feature_selection import mutual_info_regression\n",
    "\n",
    "warnings.filterwarnings('ignore', category=FutureWarning)\n",
    "sns.set_theme(style='whitegrid', font_scale=1.1)\n",
    "\n",
    "# Paths\n",
    "PROJECT_ROOT = Path(r'C:/wamp64/www/Spec_Driven_Dev_Website')\n",
    "DATA_DIR = PROJECT_ROOT / 'notebooks' / 'source' / 'data'\n",
    "\n",
    "print(f'Loading data from: {DATA_DIR}')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Load cleaned hourly dataset from Notebook 1\n",
    "hourly = pd.read_parquet(DATA_DIR / 'ieso_demand_weather_2010_2025.parquet')\n",
    "peaks = pd.read_parquet(DATA_DIR / 'ieso_peak_labels.parquet')\n",
    "\n",
    "hourly['Date'] = pd.to_datetime(hourly['Date'])\n",
    "hourly['datetime'] = pd.to_datetime(hourly['datetime'])\n",
    "peaks['date'] = pd.to_datetime(peaks['date'])\n",
    "\n",
    "print(f'Hourly dataset: {len(hourly):,} rows')\n",
    "print(f'Peak labels: {len(peaks)} records across {peaks[\"base_period\"].nunique()} base periods')\n",
    "print(f'Date range: {hourly[\"Date\"].min()} to {hourly[\"Date\"].max()}')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Temporal Features\n",
    "\n",
    "Calendar indicators including Ontario statutory holidays. Holidays behave\n",
    "like weekends for demand purposes — commercial and industrial loads drop."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Ontario statutory holidays (fixed-date and rule-based)\n",
    "def get_ontario_holidays(year):\n",
    "    \"\"\"Generate Ontario statutory holiday dates for a given year.\"\"\"\n",
    "    from datetime import date\n",
    "    import calendar\n",
    "    \n",
    "    holidays = []\n",
    "    \n",
    "    # Fixed-date holidays\n",
    "    holidays.append(date(year, 1, 1))   # New Year's Day\n",
    "    holidays.append(date(year, 7, 1))   # Canada Day\n",
    "    holidays.append(date(year, 12, 25)) # Christmas Day\n",
    "    holidays.append(date(year, 12, 26)) # Boxing Day\n",
    "    \n",
    "    # Family Day: 3rd Monday of February\n",
    "    c = calendar.monthcalendar(year, 2)\n",
    "    mondays = [week[calendar.MONDAY] for week in c if week[calendar.MONDAY] != 0]\n",
    "    holidays.append(date(year, 2, mondays[2]))\n",
    "    \n",
    "    # Good Friday: 2 days before Easter Sunday\n",
    "    # Easter calculation (Anonymous Gregorian algorithm)\n",
    "    a = year % 19\n",
    "    b = year // 100\n",
    "    c_val = year % 100\n",
    "    d = b // 4\n",
    "    e = b % 4\n",
    "    f = (b + 8) // 25\n",
    "    g = (b - f + 1) // 3\n",
    "    h = (19 * a + b - d - g + 15) % 30\n",
    "    i = c_val // 4\n",
    "    k = c_val % 4\n",
    "    l = (32 + 2 * e + 2 * i - h - k) % 7\n",
    "    m = (a + 11 * h + 22 * l) // 451\n",
    "    month = (h + l - 7 * m + 114) // 31\n",
    "    day = ((h + l - 7 * m + 114) % 31) + 1\n",
    "    easter = date(year, month, day)\n",
    "    good_friday = easter - timedelta(days=2)\n",
    "    holidays.append(good_friday)\n",
    "    \n",
    "    # Victoria Day: Monday before May 25\n",
    "    may25 = date(year, 5, 25)\n",
    "    days_since_monday = (may25.weekday()) % 7\n",
    "    victoria_day = may25 - timedelta(days=days_since_monday) if may25.weekday() != 0 else may25\n",
    "    holidays.append(victoria_day)\n",
    "    \n",
    "    # Civic Holiday: 1st Monday of August\n",
    "    c = calendar.monthcalendar(year, 8)\n",
    "    mondays = [week[calendar.MONDAY] for week in c if week[calendar.MONDAY] != 0]\n",
    "    holidays.append(date(year, 8, mondays[0]))\n",
    "    \n",
    "    # Labour Day: 1st Monday of September\n",
    "    c = calendar.monthcalendar(year, 9)\n",
    "    mondays = [week[calendar.MONDAY] for week in c if week[calendar.MONDAY] != 0]\n",
    "    holidays.append(date(year, 9, mondays[0]))\n",
    "    \n",
    "    # Thanksgiving: 2nd Monday of October\n",
    "    c = calendar.monthcalendar(year, 10)\n",
    "    mondays = [week[calendar.MONDAY] for week in c if week[calendar.MONDAY] != 0]\n",
    "    holidays.append(date(year, 10, mondays[1]))\n",
    "    \n",
    "    return [pd.Timestamp(h) for h in holidays]\n",
    "\n",
    "from datetime import timedelta\n",
    "\n",
    "# Build holiday set for all years\n",
    "all_holidays = set()\n",
    "for year in range(2010, 2027):\n",
    "    all_holidays.update(get_ontario_holidays(year))\n",
    "\n",
    "# Add temporal features\n",
    "hourly['month'] = hourly['Date'].dt.month\n",
    "hourly['day_of_week'] = hourly['Date'].dt.dayofweek  # 0=Monday, 6=Sunday\n",
    "hourly['day_of_year'] = hourly['Date'].dt.dayofyear\n",
    "hourly['week_of_year'] = hourly['Date'].dt.isocalendar().week.astype(int)\n",
    "hourly['is_weekday'] = (hourly['day_of_week'] < 5).astype(int)\n",
    "hourly['is_ontario_holiday'] = hourly['Date'].isin(all_holidays).astype(int)\n",
    "# Effective business day: weekday AND not a holiday\n",
    "hourly['is_business_day'] = ((hourly['is_weekday'] == 1) & \n",
    "                              (hourly['is_ontario_holiday'] == 0)).astype(int)\n",
    "\n",
    "print(f'Temporal features added.')\n",
    "print(f'Ontario holidays identified: {hourly[\"is_ontario_holiday\"].sum()} hourly records')\n",
    "print(f'Business days: {hourly[\"is_business_day\"].sum():,} hours '\n",
    "      f'({hourly[\"is_business_day\"].mean()*100:.1f}%)')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Weather Features\n",
    "\n",
    "Transform raw temperature and humidity into domain-relevant indicators:\n",
    "- **Humidex:** perceived temperature accounting for humidity\n",
    "- **Cooling Degree Hours (CDH):** cumulative heat exposure above 18°C\n",
    "- **Rolling averages:** multi-day thermal mass effects"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def compute_humidex(temp_c, dewpoint_c):\n",
    "    \"\"\"Calculate humidex from temperature and dewpoint.\n",
    "    \n",
    "    H = T + (5/9) * (6.11 * exp(5417.7530 * (1/273.16 - 1/(273.15+Td))) - 10)\n",
    "    \"\"\"\n",
    "    e = 6.11 * np.exp(5417.7530 * (1.0/273.16 - 1.0/(273.15 + dewpoint_c)))\n",
    "    h = temp_c + (5.0/9.0) * (e - 10.0)\n",
    "    # Humidex is only meaningful when T > 20°C\n",
    "    return np.where(temp_c >= 20, h, temp_c)\n",
    "\n",
    "# Hourly weather features\n",
    "hourly['humidex'] = compute_humidex(\n",
    "    hourly['temperature_c'].values, \n",
    "    hourly['dewpoint_c'].values\n",
    ")\n",
    "\n",
    "# Cooling Degree Hours: max(0, T - 18) per hour\n",
    "hourly['cdh'] = np.maximum(0, hourly['temperature_c'] - 18.0)\n",
    "\n",
    "# Daily aggregations\n",
    "daily_weather = hourly.groupby('Date').agg({\n",
    "    'temperature_c': ['max', 'mean', 'min'],\n",
    "    'humidex': 'max',\n",
    "    'dewpoint_c': 'mean',\n",
    "    'relative_humidity': 'mean',\n",
    "    'cdh': 'sum',  # daily CDH = sum of hourly CDH\n",
    "    'shortwave_radiation': 'sum',  # daily solar insolation\n",
    "}).reset_index()\n",
    "daily_weather.columns = ['Date', 'daily_max_temp', 'daily_mean_temp', 'daily_min_temp',\n",
    "                          'daily_max_humidex', 'daily_mean_dewpoint',\n",
    "                          'daily_mean_rh', 'daily_cdh', 'daily_solar']\n",
    "\n",
    "# Rolling averages (thermal mass effects)\n",
    "daily_weather = daily_weather.sort_values('Date').reset_index(drop=True)\n",
    "daily_weather['temp_3day_avg'] = daily_weather['daily_max_temp'].rolling(3, min_periods=1).mean()\n",
    "daily_weather['temp_5day_avg'] = daily_weather['daily_max_temp'].rolling(5, min_periods=1).mean()\n",
    "daily_weather['cdh_3day_avg'] = daily_weather['daily_cdh'].rolling(3, min_periods=1).mean()\n",
    "\n",
    "print(f'Daily weather features: {len(daily_weather)} days')\n",
    "print(f'\\nFeature statistics (summer months only):')\n",
    "summer_mask = daily_weather['Date'].dt.month.isin([6, 7, 8])\n",
    "print(daily_weather.loc[summer_mask, ['daily_max_temp', 'daily_max_humidex', 'daily_cdh']].describe().round(1))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Demand Momentum Features\n",
    "\n",
    "Lagged and rolling demand features capture system-level momentum.\n",
    "These are available by the morning of the prediction day."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Daily demand summary\n",
    "daily_demand = hourly.groupby('Date').agg({\n",
    "    'Ontario Demand': ['max', 'mean'],\n",
    "    'base_period': 'first',\n",
    "    'is_business_day': 'first',\n",
    "    'is_weekday': 'first',\n",
    "    'is_ontario_holiday': 'first',\n",
    "    'month': 'first',\n",
    "    'day_of_week': 'first',\n",
    "    'day_of_year': 'first',\n",
    "}).reset_index()\n",
    "daily_demand.columns = ['Date', 'daily_max_demand', 'daily_mean_demand',\n",
    "                         'base_period', 'is_business_day', 'is_weekday',\n",
    "                         'is_ontario_holiday', 'month', 'day_of_week', 'day_of_year']\n",
    "\n",
    "# Morning demand ramp: demand at Hour 10 (10 AM) as early predictor\n",
    "morning_demand = hourly[hourly['Hour'] == 10][['Date', 'Ontario Demand']].copy()\n",
    "morning_demand.columns = ['Date', 'demand_at_10am']\n",
    "daily_demand = daily_demand.merge(morning_demand, on='Date', how='left')\n",
    "\n",
    "# Demand at Hour 13 (1 PM) for intraday update\n",
    "afternoon_demand = hourly[hourly['Hour'] == 13][['Date', 'Ontario Demand']].copy()\n",
    "afternoon_demand.columns = ['Date', 'demand_at_1pm']\n",
    "daily_demand = daily_demand.merge(afternoon_demand, on='Date', how='left')\n",
    "\n",
    "# Lagged features (shifted by 1 day so they're available in the morning)\n",
    "daily_demand = daily_demand.sort_values('Date').reset_index(drop=True)\n",
    "daily_demand['prev_day_max_demand'] = daily_demand['daily_max_demand'].shift(1)\n",
    "daily_demand['prev_day_mean_demand'] = daily_demand['daily_mean_demand'].shift(1)\n",
    "\n",
    "# Rolling windows\n",
    "daily_demand['rolling_7d_max_demand'] = (\n",
    "    daily_demand['daily_max_demand'].shift(1).rolling(7, min_periods=1).max()\n",
    ")\n",
    "daily_demand['rolling_7d_mean_demand'] = (\n",
    "    daily_demand['daily_max_demand'].shift(1).rolling(7, min_periods=1).mean()\n",
    ")\n",
    "\n",
    "# Year-over-year growth (same month, previous year average)\n",
    "monthly_avg = daily_demand.groupby([daily_demand['Date'].dt.year, 'month'])['daily_max_demand'].mean()\n",
    "monthly_avg = monthly_avg.reset_index()\n",
    "monthly_avg.columns = ['year', 'month', 'monthly_avg_demand']\n",
    "\n",
    "print(f'Daily demand features: {len(daily_demand)} days')\n",
    "print(f'\\nLagged feature sample:')\n",
    "print(daily_demand[['Date', 'daily_max_demand', 'prev_day_max_demand', \n",
    "                     'rolling_7d_max_demand']].dropna().tail(5).to_string(index=False))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Peak Context Features\n",
    "\n",
    "These features are unique to the coincident peak prediction problem. They track\n",
    "the running top-5 peak state within each base period, providing the displacement\n",
    "threshold that defines whether a day could produce a new peak.\n",
    "\n",
    "**Critical:** these are computed sequentially within each base period to prevent\n",
    "look-ahead bias."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Build peak context features within each base period\n",
    "# Using ground truth peak labels from the archive\n",
    "top5 = peaks[peaks['rank'] <= 5].copy()\n",
    "\n",
    "def compute_peak_context(daily_df, peaks_df):\n",
    "    \"\"\"Compute running peak tracker features for each base period.\n",
    "    \n",
    "    For each day, computes:\n",
    "    - current_5th_peak: the 5th-highest demand seen so far in this base period\n",
    "    - peaks_so_far: how many of the eventual top-5 peaks have occurred\n",
    "    - days_since_last_peak: days since the last top-5 peak in this base period\n",
    "    - max_demand_so_far: highest daily max demand in this base period to date\n",
    "    \"\"\"\n",
    "    results = []\n",
    "    \n",
    "    for bp in sorted(daily_df['base_period'].unique()):\n",
    "        bp_mask = daily_df['base_period'] == bp\n",
    "        bp_days = daily_df[bp_mask].sort_values('Date').copy()\n",
    "        \n",
    "        # Get actual peak dates for this base period\n",
    "        bp_peaks = peaks_df[peaks_df['base_period'] == bp]\n",
    "        peak_dates = set(bp_peaks['date'].dt.date) if len(bp_peaks) > 0 else set()\n",
    "        \n",
    "        running_top5 = []  # Track top-5 demands seen so far\n",
    "        last_peak_date = None\n",
    "        \n",
    "        for idx, row in bp_days.iterrows():\n",
    "            current_date = row['Date'].date() if hasattr(row['Date'], 'date') else row['Date']\n",
    "            \n",
    "            # Current threshold (5th highest demand seen so far)\n",
    "            if len(running_top5) >= 5:\n",
    "                current_5th = sorted(running_top5, reverse=True)[4]\n",
    "            elif len(running_top5) > 0:\n",
    "                current_5th = min(running_top5)\n",
    "            else:\n",
    "                current_5th = 0\n",
    "            \n",
    "            # Days since last peak\n",
    "            if last_peak_date is not None:\n",
    "                days_since = (pd.Timestamp(current_date) - last_peak_date).days\n",
    "            else:\n",
    "                days_since = -1  # No peak yet this base period\n",
    "            \n",
    "            # Max demand so far\n",
    "            max_so_far = max(running_top5) if running_top5 else 0\n",
    "            \n",
    "            results.append({\n",
    "                'Date': row['Date'],\n",
    "                'current_5th_peak': current_5th,\n",
    "                'peaks_so_far': sum(1 for d in running_top5 \n",
    "                                   if d >= (sorted(running_top5, reverse=True)[4] \n",
    "                                           if len(running_top5) >= 5 else 0)),\n",
    "                'num_top5_seen': min(len(running_top5), 5),\n",
    "                'days_since_last_peak': days_since,\n",
    "                'max_demand_so_far': max_so_far,\n",
    "            })\n",
    "            \n",
    "            # Update running tracker AFTER computing features for today\n",
    "            # (prevents look-ahead bias)\n",
    "            demand_today = row['daily_max_demand']\n",
    "            if not np.isnan(demand_today):\n",
    "                running_top5.append(demand_today)\n",
    "                running_top5 = sorted(running_top5, reverse=True)[:10]  # Keep top 10\n",
    "            \n",
    "            # Track if today was a peak day\n",
    "            if current_date in peak_dates:\n",
    "                last_peak_date = pd.Timestamp(current_date)\n",
    "    \n",
    "    return pd.DataFrame(results)\n",
    "\n",
    "peak_context = compute_peak_context(daily_demand, top5)\n",
    "print(f'Peak context features computed: {len(peak_context)} days')\n",
    "print(f'\\nSample (summer 2024):')\n",
    "sample_mask = (peak_context['Date'] >= '2024-06-01') & (peak_context['Date'] <= '2024-08-31')\n",
    "print(peak_context[sample_mask].head(10).to_string(index=False))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Target Variable Construction\n",
    "\n",
    "Create the regression target (daily max demand) and binary classification labels\n",
    "(is this a top-5 peak day?) from ground truth data."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Mark peak days in the daily dataset\n",
    "peak_dates_set = set(top5['date'].dt.date)\n",
    "daily_demand['is_peak_day'] = daily_demand['Date'].apply(\n",
    "    lambda d: 1 if d.date() in peak_dates_set else 0\n",
    ")\n",
    "\n",
    "print(f'Peak day labels:')\n",
    "print(f'  Total peak days: {daily_demand[\"is_peak_day\"].sum()}')\n",
    "print(f'  Total non-peak days: {(daily_demand[\"is_peak_day\"] == 0).sum()}')\n",
    "print(f'  Peak day rate: {daily_demand[\"is_peak_day\"].mean()*100:.2f}%')\n",
    "print(f'\\nPeak days per base period:')\n",
    "print(daily_demand.groupby('base_period')['is_peak_day'].sum().to_string())"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Assemble Feature Matrix\n",
    "\n",
    "Merge all feature groups into a single daily-resolution feature matrix."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Merge all features\n",
    "features = daily_demand.merge(daily_weather, on='Date', how='left')\n",
    "features = features.merge(peak_context, on='Date', how='left')\n",
    "\n",
    "# Drop rows with insufficient data (first few days of each base period)\n",
    "features = features.dropna(subset=['prev_day_max_demand', 'daily_max_temp']).copy()\n",
    "\n",
    "print(f'Feature matrix: {len(features)} days x {len(features.columns)} columns')\n",
    "print(f'\\nFeature columns:')\n",
    "for col in sorted(features.columns):\n",
    "    print(f'  {col}: {features[col].dtype}')\n",
    "\n",
    "print(f'\\nDate range: {features[\"Date\"].min()} to {features[\"Date\"].max()}')\n",
    "print(f'Base periods: {sorted(features[\"base_period\"].unique())}')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Feature Correlation Matrix"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Select numeric features for correlation\n",
    "feature_cols = [\n",
    "    'daily_max_temp', 'daily_mean_temp', 'daily_max_humidex', 'daily_cdh',\n",
    "    'daily_mean_rh', 'daily_mean_dewpoint', 'temp_3day_avg', 'temp_5day_avg',\n",
    "    'cdh_3day_avg', 'daily_solar',\n",
    "    'month', 'day_of_week', 'is_business_day', 'day_of_year',\n",
    "    'prev_day_max_demand', 'rolling_7d_max_demand', 'rolling_7d_mean_demand',\n",
    "    'demand_at_10am',\n",
    "    'current_5th_peak', 'max_demand_so_far', 'days_since_last_peak',\n",
    "    'daily_max_demand'\n",
    "]\n",
    "\n",
    "corr_data = features[feature_cols].dropna()\n",
    "corr_matrix = corr_data.corr()\n",
    "\n",
    "fig, ax = plt.subplots(figsize=(16, 14))\n",
    "mask = np.triu(np.ones_like(corr_matrix, dtype=bool))\n",
    "sns.heatmap(corr_matrix, mask=mask, annot=True, fmt='.2f', cmap='RdBu_r',\n",
    "            center=0, vmin=-1, vmax=1, square=True, ax=ax,\n",
    "            annot_kws={'fontsize': 7})\n",
    "ax.set_title('Feature Correlation Matrix', fontsize=14)\n",
    "plt.tight_layout()\n",
    "plt.savefig(DATA_DIR / 'feature_correlation_matrix.png', dpi=150, bbox_inches='tight')\n",
    "plt.show()\n",
    "\n",
    "# Highlight high correlations with target\n",
    "target_corr = corr_matrix['daily_max_demand'].drop('daily_max_demand').sort_values(ascending=False)\n",
    "print('\\nCorrelation with daily_max_demand:')\n",
    "print(target_corr.to_string())"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Feature Importance (Mutual Information)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Mutual information scores\n",
    "predictor_cols = [c for c in feature_cols if c != 'daily_max_demand']\n",
    "mi_data = features[predictor_cols + ['daily_max_demand']].dropna()\n",
    "\n",
    "X_mi = mi_data[predictor_cols].values\n",
    "y_mi = mi_data['daily_max_demand'].values\n",
    "\n",
    "mi_scores = mutual_info_regression(X_mi, y_mi, random_state=42)\n",
    "mi_df = pd.DataFrame({'Feature': predictor_cols, 'MI Score': mi_scores})\n",
    "mi_df = mi_df.sort_values('MI Score', ascending=True)\n",
    "\n",
    "fig, ax = plt.subplots(figsize=(10, 8))\n",
    "colors = ['#d32f2f' if 'temp' in f or 'humidex' in f or 'cdh' in f or 'solar' in f\n",
    "          else '#1565C0' if 'demand' in f or 'peak' in f\n",
    "          else '#4CAF50' for f in mi_df['Feature']]\n",
    "ax.barh(range(len(mi_df)), mi_df['MI Score'], color=colors)\n",
    "ax.set_yticks(range(len(mi_df)))\n",
    "ax.set_yticklabels(mi_df['Feature'])\n",
    "ax.set_xlabel('Mutual Information Score')\n",
    "ax.set_title('Feature Importance for Daily Max Demand Prediction\\n'\n",
    "             '(Red=Weather, Blue=Demand, Green=Calendar)')\n",
    "\n",
    "plt.tight_layout()\n",
    "plt.savefig(DATA_DIR / 'feature_importance_mi.png', dpi=150, bbox_inches='tight')\n",
    "plt.show()\n",
    "\n",
    "print('\\nTop 10 features by mutual information:')\n",
    "print(mi_df.sort_values('MI Score', ascending=False).head(10).to_string(index=False))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Save Feature-Engineered Dataset"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Save the complete feature matrix\n",
    "features.to_parquet(DATA_DIR / 'ieso_features_daily.parquet', index=False)\n",
    "print(f'Saved feature-engineered dataset: {len(features)} days x {len(features.columns)} columns')\n",
    "print(f'  -> {DATA_DIR / \"ieso_features_daily.parquet\"}')\n",
    "\n",
    "# Also save the hourly dataset with weather features for Notebook 5\n",
    "hourly_with_features = hourly.copy()\n",
    "hourly_with_features.to_parquet(DATA_DIR / 'ieso_hourly_with_features.parquet', index=False)\n",
    "print(f'\\nSaved hourly dataset with features: {len(hourly_with_features):,} rows')\n",
    "print(f'  -> {DATA_DIR / \"ieso_hourly_with_features.parquet\"}')\n",
    "\n",
    "print('\\n=== Notebook 2 complete ===')"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "name": "python",
   "version": "3.12.0"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}
