-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathprocess_hp_data_4.py
More file actions
363 lines (308 loc) · 12.5 KB
/
process_hp_data_4.py
File metadata and controls
363 lines (308 loc) · 12.5 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
# -*- coding: utf-8 -*-
"""
Created on Wed Jan 22 15:17:22 2025
@author: luisfernando
"""
import pandas as pd
import os
import sys
import gc
import time
import joblib # Import joblib for saving/loading model
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
# Scikit-learn models
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor
import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
# Function 1: evaluate all models
def evaluate_model(model, X_train, X_test, y_train, y_test, model_name="Model",
                   feature_set_name="FeatureSet", target_name="Target",
                   output_dir='plots'):
    """
    Train a regression model, report error metrics, and save a true-vs-predicted
    scatter plot to disk.

    Parameters:
        model: The regression model (e.g., LinearRegression(), RandomForestRegressor()).
        X_train: Training features.
        X_test: Testing features.
        y_train: Training target values.
        y_test: Testing target values.
        model_name: Name of the model for display purposes.
        feature_set_name: Name of the feature set for identification in the plot.
        target_name: Name of the target variable for identification in the plot.
        output_dir: Directory where plots will be saved (created if missing).

    Returns:
        str: A formatted multi-line string with MSE, RMSE, MAE, and R² metrics
        (the same text that is printed to the console).
    """
    # Train the model and predict on the held-out set.
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    # Energy consumption cannot be negative, so clip predictions at 0
    # before scoring.
    y_pred_clipped = np.maximum(y_pred, 0)

    # Metrics are computed on the clipped predictions.
    mse = mean_squared_error(y_test, y_pred_clipped)
    rmse = np.sqrt(mse)
    mae = mean_absolute_error(y_test, y_pred_clipped)
    r2 = r2_score(y_test, y_pred_clipped)

    metrics_str = (
        f"\n{model_name} Metrics ({feature_set_name} -> {target_name}):\n"
        f"Mean Squared Error (MSE): {mse:.2f}\n"
        f"Root Mean Squared Error (RMSE): {rmse:.2f}\n"
        f"Mean Absolute Error (MAE): {mae:.2f}\n"
        f"R-squared (R²): {r2:.2f}\n"
    )
    # Print metrics to console.
    print(metrics_str)

    # ---- Limit the data for plotting ----
    # Scattering every test point is too slow for large datasets, so plot a
    # random subset of at most 50,000 points.
    subset_size = min(50000, len(y_test))
    sampled_indices = np.random.choice(len(y_test), size=subset_size, replace=False)
    y_test_sampled = np.array(y_test)[sampled_indices]
    y_pred_sampled = np.array(y_pred_clipped)[sampled_indices]

    # Plot True vs Predicted Values (sampled).
    plt.figure(figsize=(10, 6))
    plt.scatter(y_test_sampled, y_pred_sampled, alpha=0.7, label="Predictions (Sampled)")
    plt.plot([min(y_test_sampled), max(y_test_sampled)],
             [min(y_test_sampled), max(y_test_sampled)],
             color='red', linestyle='--', label="Perfect Prediction")
    plt.xlabel("True Values (y_test)")
    plt.ylabel("Predicted Values (y_pred_clipped)")
    plt.title(f"True vs Predicted ({model_name}, {feature_set_name} -> {target_name})")
    plt.legend()
    plt.grid(True)
    print('Figure has plotted, now to printing')

    # Save the plot, then close the figure so figures don't accumulate in
    # memory across the many model/feature-set combinations.
    os.makedirs(output_dir, exist_ok=True)
    filename = os.path.join(output_dir, f"{model_name}_{feature_set_name}_{target_name}.png")
    plt.savefig(filename)
    print(f"Plot saved as: {filename}")
    plt.close()
    return metrics_str
# Function 2: save the model
def save_model(model, filename, compress=3):
    """
    Persist a trained model to disk with joblib compression.

    Parameters:
        model: The fitted model object to serialize.
        filename: Destination path; parent directories are created if missing.
        compress: joblib compression level (0-9); 3 balances size and speed.

    Errors are caught and printed rather than raised, so a single failed save
    does not abort the whole evaluation run.
    """
    try:
        # Only create the directory when the path actually has one:
        # os.makedirs('') raises FileNotFoundError for bare filenames.
        dirname = os.path.dirname(filename)
        if dirname:
            os.makedirs(dirname, exist_ok=True)
        # Save with compression.
        joblib.dump(model, filename, compress=compress)
        print(f"Model saved successfully: {filename}")
    except Exception as e:
        print(f"Error saving model {filename}: {e}")
'''
Start the process below.
'''
START_PROCESS = time.time()

# Suffix used both to select the input parquet files and to tag every output
# artifact (csv, plots dir, models dir, metrics file).
JSON_STR = "4"  # Replace with your desired string

# 1) Read the parquet files to process the model (check first):
# File names look like simple_parquet_<...>_<suffix>.parquet; match on the
# last underscore-separated token.
directory = "./"
parquet_files = [f for f in os.listdir(directory)
                 if f.startswith("simple_parquet")
                 and f.endswith(".parquet")
                 and JSON_STR in f.split('_')[-1]]

# Append multiple dataframes to print csv and observe data.
BOOL_DF_APPEND = True
BOOL_VISUAL_DF_APPEND = False
dataframes = []
if BOOL_DF_APPEND:
    for file in parquet_files:
        try:
            # Read the Parquet file, keeping only the columns used downstream
            # (non-efficiency factors plus the two consumption targets).
            df = pd.read_parquet(directory + file)
            df_select = df[['timestamp',
                            'out.electricity.heating.energy_consumption',
                            'out.electricity.heating_hp_bkup.energy_consumption',
                            'Diff',
                            'sqft',
                            'representative_income',
                            'occupants',
                            'usage_level',
                            'rmb_heating_primary',
                            'rmb_heatpump_backup',
                            'vintage',
                            'rmb_window_area',
                            'duct_leakage_and_insulation',
                            'heating_setpoint',
                            'hvac_heating_efficiency']]
            dataframes.append(df_select)
            print(f"Successfully read {file}")
        except Exception as e:
            print(f"Error reading {file}: {e}")

# Concatenate every state's frame into one dataset.
combined_df = pd.concat(dataframes, ignore_index=True)
if BOOL_VISUAL_DF_APPEND:
    # Optionally dump the combined data for visual inspection.
    filename = f"all_states_{JSON_STR}.csv"
    combined_df.to_csv(filename, index=False)

# Release the per-file frames before sampling to keep peak memory down.
del dataframes
gc.collect()

# Randomly sample 100k rows (or all rows if fewer exist).
# BUG FIX: the sample size was previously based on len(df) — the *last* file
# read in the loop — instead of the combined dataset.
num_samples = min(100_000, len(combined_df))
df_sampled = combined_df.sample(n=num_samples, random_state=555)

# Define the feature sets: a minimal set, two intermediate sets (household
# vs. building-envelope attributes), and the union of all features.
feature_sets = {
    "Set 1": [
        'out.electricity.heating.energy_consumption',
        'Diff'
    ],
    "Set 2": [
        'out.electricity.heating.energy_consumption',
        'Diff',
        'sqft',
        'representative_income',
        'occupants',
        'usage_level',
        'rmb_heating_primary',
        'rmb_heatpump_backup'
    ],
    "Set 3": [
        'out.electricity.heating.energy_consumption',
        'Diff',
        'vintage',
        'rmb_window_area',
        'duct_leakage_and_insulation',
        'heating_setpoint',
        'hvac_heating_efficiency'
    ],
    "Set 4": [
        'out.electricity.heating.energy_consumption',
        'Diff',
        'sqft',
        'representative_income',
        'occupants',
        'usage_level',
        'rmb_heating_primary',
        'rmb_heatpump_backup',
        'vintage',
        'rmb_window_area',
        'duct_leakage_and_insulation',
        'heating_setpoint',
        'hvac_heating_efficiency'
    ]
}

# Define the targets.
targets = {
    "Backup Heat": 'out.electricity.heating_hp_bkup.energy_consumption',
    "Main Heat": 'out.electricity.heating.energy_consumption'
}

# Models to evaluate. (A duplicate, dead `models` dict that was silently
# overwritten by this one has been removed.)
models = {
    "Linear Regression": LinearRegression(),
    "Random Forest": RandomForestRegressor(n_estimators=100, random_state=555),
    "Gradient Boosting": GradientBoostingRegressor(n_estimators=100, random_state=555),
    "SVR": SVR(kernel='rbf'),
    "Decision Tree": DecisionTreeRegressor(random_state=555),
    "K-Neighbors": KNeighborsRegressor(n_neighbors=5)
}

# Create a directory to store models.
model_dir = "saved_models_" + JSON_STR
os.makedirs(model_dir, exist_ok=True)

# Initialize scaler (a duplicate earlier initialization was removed).
scaler = StandardScaler()

# Define the output file name using JSON_STR.
output_file = f"model_evaluation_{JSON_STR}.txt"

# Open the output file in write mode before the loops.
with open(output_file, "w") as f:
    # Iterate through feature sets.
    for set_name, features in feature_sets.items():
        print("--------------------------------------------------")
        print(f"\nHeader for Feature Set: {set_name} ({features})")
        # Iterate through targets.
        for target_name, target in targets.items():
            print("--------------------------------------------------")
            print(f"\nHeader for Target: {target_name}")
            # Prepare the data.
            X = df_sampled[features]
            y = df_sampled[target]
            # When predicting main heat, drop it from the features so the
            # target is not also an input.
            if 'Main Heat' == target_name:
                X = X.drop(columns=['out.electricity.heating.energy_consumption'])
            # Scale features.
            # NOTE(review): the scaler is fit on the full sample *before* the
            # train/test split, so test-set statistics leak into the scaling.
            # Left unchanged to preserve existing results; consider fitting on
            # X_train only.
            X_scaled = scaler.fit_transform(X)
            # Split the data.
            X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=555)
            # Evaluate models.
            for model_name, model in models.items():
                print("--------------------------------------------------")
                print(f"\nEvaluating Feature Set: {set_name} ({features})")
                print(f"\nTarget: {target_name}")
                print(f"\nModel: {model_name}")
                f.write("--------------------------------------------------\n")
                f.write(f"Evaluating Feature Set: {set_name} ({features})\n")
                f.write(f"Target: {target_name}\n")
                f.write(f"Model: {model_name}\n")
                metrics_str = evaluate_model(
                    model, X_train, X_test, y_train, y_test,
                    model_name=model_name,
                    feature_set_name=set_name,
                    target_name=target_name,
                    output_dir="plots" + '_' + JSON_STR
                )
                # Write the metrics to the file.
                f.write(metrics_str + "\n")
                # Save the trained model (compressed, via save_model).
                model_filename = f"{model_dir}/{model_name}_{set_name}_{target_name}.joblib"
                print(model_filename)
                save_model(model, model_filename)
                print(f"Model saved as: {model_filename}")
                f.write(f"Model saved as: {model_filename}\n")

END_PROCESS = time.time()
TIME_ELAPSED = END_PROCESS - START_PROCESS
print('\n TIME ELAPSED:')
print(str(TIME_ELAPSED) + ' seconds /', str(TIME_ELAPSED/60) + ' minutes.')