Data-Driven SHM of Hospital Real¶
This Jupyter Notebook demonstrates a data-driven approach to Structural Health Monitoring (SHM) using a real-world dataset from the Hospital Real building in Granada. The primary goal is to understand and correct for the influence of environmental factors—such as temperature and humidity—on structural modal properties, enabling more accurate detection of structural changes.
Overview¶
Input Features: Weather data
$$ \mathbf{x} = [T, H]^T $$
where $T$ is temperature and $H$ is humidity.
Quantities of Interest: Time series of frequencies and mode shapes
$$ \mathbf{y} = [f_1, f_2, f_3, \dots, \boldsymbol{\psi}_1, \boldsymbol{\psi}_2, \boldsymbol{\psi}_3, \dots] $$
The input and output datasets are linked by timestamp indices. Relevant modes and features are selected for modeling.
A Linear Regression (LinReg) model is trained on the input features to predict the frequencies and mode shapes.
Sensitivity analyses using Sobol indices are performed to understand the global influence of temperature and humidity.
SHAP (SHapley Additive exPlanations) values are computed to quantify the local impact of individual features on model predictions.
The effects of environmental variables are visualized and interpreted.
Finally, four different models trained on frequencies are compared to see how well they capture and correct for environmental effects:
gPCE (Generalized Polynomial Chaos Expansion): Models uncertainty by expressing the output as an expansion in polynomials orthogonal to the input distribution, with coefficients determined by projecting the system's response onto that polynomial basis (see the expansion sketch after this list).
DNN (Deep Neural Networks): Machine learning models composed of interconnected neurons, capable of learning complex, non-linear patterns from data.
GBT (Gradient Boosting Trees): Combines weak decision trees into a strong predictive model, iteratively correcting errors from previous trees for robust performance.
LinReg (Linear Regression): A statistical method that models the relationship between a dependent variable and one or more independent variables by fitting a linear equation to observed data.
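For reference, the gPCE surrogate has the standard polynomial chaos form (a sketch from the general theory; the truncation order $p$ corresponds to the 'p': 2 setting in the model configuration below):
$$ y(\mathbf{x}) \approx \sum_{|\boldsymbol{\alpha}| \le p} c_{\boldsymbol{\alpha}} \, \Psi_{\boldsymbol{\alpha}}(\mathbf{x}) $$
where the $\Psi_{\boldsymbol{\alpha}}$ are polynomials orthogonal with respect to the input distribution and the coefficients $c_{\boldsymbol{\alpha}}$ are obtained by projection onto that basis.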
Imports¶
import sys
import os

# make the surrogate modelling library importable
parent_dir = os.path.abspath("../../libraries/surrogate_modelling")
sys.path.insert(0, parent_dir)

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings

import utils  # needed for the utils.* calls below (e.g. utils.get_simparamset_from_data)
from utils import *
from distributions import *
from simparameter import SimParameter
from simparameter_set import SimParamSet
from surrogate_model import SurrogateModel
from preprocess_modeshape_data import *
from digital_twin import DigitalTwin

warnings.filterwarnings("ignore", category=RuntimeWarning)
warnings.filterwarnings("ignore", category=DeprecationWarning)
warnings.filterwarnings("ignore", category=FutureWarning)
Load data¶
Load modeshape and weather data¶
df_modeshapes = df_from_unv('../data/OMA hospital/hospital_data_06_01.unv')
weather_df = preprocess_weather_data('../data/OMA hospital/granada_weather_data.csv')
Merge dataframes¶
modeshapes_df = merge_dataframes(df_modeshapes, weather_df)
modeshapes_df.head(2)
| | mode_id | frequency | xi | phi_sensor_1_real | phi_sensor_1_imag | phi_sensor_2_real | phi_sensor_2_imag | phi_sensor_3_real | phi_sensor_3_imag | phi_sensor_4_real | ... | phi_sensor_5_imag | phi_sensor_6_real | phi_sensor_6_imag | Datetime | Temperature | Humidity | Dew Point | Apparent Temperature | Wind Speed | Wind Direction |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1 | 3.86939 | 0.03289 | 0.537656 | -0.016287 | 0.082241 | -0.088756 | 0.452694 | 0.007940 | 1.0 | ... | -0.228746 | 0.972776 | 0.003462 | 2025-02-04 14:23:48 | 11.8 | 51.0 | 2.1 | 9.7 | 1.8 | 276.0 |
| 1 | 3 | 5.84158 | 0.03510 | 0.658683 | -0.069137 | -0.554060 | -0.011493 | -0.688572 | 0.150488 | 1.0 | ... | 0.085707 | -1.087620 | 0.048649 | 2025-02-04 14:23:48 | 11.8 | 51.0 | 2.1 | 9.7 | 1.8 | 276.0 |
2 rows × 22 columns
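merge_dataframes is a library helper; below is a minimal sketch of the alignment it presumably performs, assuming both frames carry a Datetime column and each mode-shape record is matched to the nearest weather timestamp (the actual implementation may differ):
# sketch (assumption): pair each mode-shape record with the nearest weather record
merged = pd.merge_asof(
    df_modeshapes.sort_values('Datetime'),
    weather_df.sort_values('Datetime'),
    on='Datetime',
    direction='nearest',
)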
fig = plot_correlation_mx(modeshapes_df, annot=False)
Filter modes and features¶
# selecting modes to use
modes = [0,1]
# selecting environmental effects to use
X_cols = ['Temperature', 'Humidity']
# choose from: frequency, frequency+modeshapes, modeshapes, or custom list of columns
y_cols = 'frequency+modeshapes'
X, y, Q, QoI_names = select_cols(modeshapes_df, modes, X_cols, y_cols)
y, QoI_names, const_cols = remove_constant_columns(y, QoI_names) # removing constant columns
Constant columns ['phi_sensor_5_real_mode_0'] were found and excluded.
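A minimal sketch of what remove_constant_columns presumably does (an assumption based on its name and the message above). Dropping zero-variance outputs matters because they break the normalized evaluation metrics later, which divide by the output's spread:
# sketch (assumption): drop outputs that never change
const_cols = [c for c in y.columns if y[c].nunique(dropna=True) <= 1]
y = y.drop(columns=const_cols)
QoI_names = [n for n in QoI_names if n not in const_cols]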
For training and sensitivity analysis¶
Q = utils.get_simparamset_from_data(X)
QoI_names = y.columns.to_list()
max_index = 1
QoI_param = 'frequency_mode_1' # Quantity of Interest parameter
subtracted_effects = ['Temperature', 'Humidity']
For update¶
nwalkers = 64
nburn = 800
niter = 200
sigma = y.std(axis=0) # standard deviation of the data
E = utils.generate_stdrn_simparamset(sigma.values)
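Assuming generate_stdrn_simparamset builds a set of independent zero-mean Gaussian parameters with the given standard deviations, E encodes the additive measurement-error model used in the Bayesian update:
$$ \mathbf{z}_m = \mathbf{y}(\mathbf{x}) + \boldsymbol{\varepsilon}, \qquad \boldsymbol{\varepsilon} \sim \mathcal{N}\!\left(\mathbf{0}, \operatorname{diag}(\boldsymbol{\sigma}^2)\right) $$
where $\boldsymbol{\sigma}$ is the per-output standard deviation computed above.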
Train model¶
Model configurations¶
config_dnn = {
'init_config' : {
'layers': [
{'neurons': 32, 'activation': 'relu', 'dropout': 0.1},
{'neurons': 64, 'activation': 'sigmoid', 'dropout': 0.1},
{'neurons': 128, 'activation': 'relu', 'dropout': None},
],
'outputAF': 'tanh'
},
'train_config' : {
'optimizer': 'Adam',
'loss': 'MSE',
'epochs': 200,
'batch_size': 32,
'k_fold': None,
'early_stopping': {
'patience': 50,
'min_delta': 0.0001}
}
}
config_gbt = {
'init_config' : {
'gbt_method': 'xgboost'
},
'train_config' : {
'max_depth': 6,
'num_of_iter': 250,
'k_fold': 5
}
}
config_gpce = {
'init_config' : {
'p' : 2
},
'train_config' : {
'k_fold': 5
}
}
config_linreg = {
'init_config': {},
'train_config': {}
}
config = {'LinReg': config_linreg, 'DNN': config_dnn, 'GBT': config_gbt, 'gPCE': config_gpce}
split_config = {
'train_test_ratio': 0.2,
'random_seed': 1997,
'split_type': 'no_shuffle'
}
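With split_type set to 'no_shuffle', the split presumably preserves the temporal order of the samples, so the test set is the chronologically last block. A sketch of the assumed behavior (assuming train_test_ratio is the test fraction; random_seed would then only matter for shuffled splits):
# sketch (assumption): chronological hold-out of the last 20% of rows
n_test = int(len(X) * split_config['train_test_ratio'])
X_train, X_test = X.iloc[:-n_test], X.iloc[-n_test:]
y_train, y_test = y.iloc[:-n_test], y.iloc[-n_test:]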
LinReg model¶
method = 'LinReg'
# Initialize surrogate model
model = SurrogateModel(Q, QoI_names, method, **config.get(method)['init_config'])
# Split data into training and testing sets
model.train_test_split(X, y, **split_config)
# fill missing values with the training-set mean (the test set also uses the
# training mean to avoid leakage)
for col in QoI_names:
model.y_train[col] = model.y_train[col].fillna(model.y_train[col].mean())
model.y_test[col] = model.y_test[col].fillna(model.y_train[col].mean())
# Train the model
model.train(model.X_train, model.y_train, **config.get(method)['train_config'])
metrics, agg_metrics = model.evaluate_model(verbose=False)
print('\nEvaluation metrics:')
metrics
----- Training started for 'LinReg' model -----
Average train loss: 0.00093359315806, Average valid loss: 0.00137374123858
----- Training ended for 'LinReg' model -----

Evaluation metrics:
| Output_Name | Kendall_tau | Pearson | Spearman | MSE | MAE | RMSE | STD_of_label | NRMSE | NMAE | rel_RMSE | summed_metric |
|---|---|---|---|---|---|---|---|---|---|---|---|
| frequency_mode_0 | 0.089125 | 0.128409 | 0.113541 | 1.885709e-03 | 1.394662e-02 | 4.342475e-02 | 0.034972 | 0.111466 | 0.035799 | 1.241702 | -0.303542 |
| phi_sensor_1_real_mode_0 | 0.027417 | 0.001583 | 0.035429 | 3.828094e-04 | 7.920714e-03 | 1.956552e-02 | 0.020182 | 0.095744 | 0.038760 | 0.969458 | -0.301676 |
| phi_sensor_1_imag_mode_0 | -0.013156 | -0.039521 | -0.016841 | 3.775581e-04 | 8.773910e-03 | 1.943086e-02 | 0.021525 | 0.102604 | 0.046330 | 0.902732 | -0.324084 |
| phi_sensor_2_real_mode_0 | 0.144366 | 0.146593 | 0.184928 | 1.773073e-04 | 5.735897e-03 | 1.331568e-02 | 0.013777 | 0.074057 | 0.031901 | 0.966518 | -0.163543 |
| phi_sensor_2_imag_mode_0 | -0.065811 | -0.082379 | -0.084083 | 1.228193e-04 | 4.326586e-03 | 1.108239e-02 | 0.012575 | 0.071227 | 0.027807 | 0.881307 | -0.371193 |
| phi_sensor_3_real_mode_0 | 0.031128 | 0.032143 | 0.039725 | 2.240161e-04 | 5.918088e-03 | 1.496717e-02 | 0.016479 | 0.089545 | 0.035406 | 0.908255 | -0.268420 |
| phi_sensor_3_imag_mode_0 | -0.029569 | -0.058572 | -0.038104 | 3.600933e-04 | 8.132650e-03 | 1.897612e-02 | 0.020461 | 0.088162 | 0.037784 | 0.927420 | -0.351222 |
| phi_sensor_4_real_mode_0 | 0.028976 | 0.027880 | 0.036993 | 8.712248e-04 | 1.106307e-02 | 2.951652e-02 | 0.032641 | 0.081600 | 0.030584 | 0.904273 | -0.270141 |
| phi_sensor_4_imag_mode_0 | 0.003144 | -0.021485 | 0.003933 | 1.067629e-03 | 1.404269e-02 | 3.267460e-02 | 0.032935 | 0.102268 | 0.043952 | 0.992080 | -0.335496 |
| phi_sensor_5_imag_mode_0 | NaN | NaN | NaN | 6.504440e-40 | 8.592467e-22 | 2.550380e-20 | 0.000000 | inf | inf | NaN | 0.000000 |
| phi_sensor_6_real_mode_0 | 0.024831 | 0.005620 | 0.031963 | 7.105216e-04 | 9.809696e-03 | 2.665561e-02 | 0.027493 | 0.102165 | 0.037598 | 0.969550 | -0.302379 |
| phi_sensor_6_imag_mode_0 | -0.025279 | -0.044983 | -0.032409 | 7.366666e-04 | 1.299651e-02 | 2.714160e-02 | 0.031520 | 0.080426 | 0.038511 | 0.861086 | -0.321252 |
| frequency_mode_1 | 0.340718 | 0.547087 | 0.486452 | 3.225526e-03 | 4.091083e-02 | 5.679372e-02 | 0.029734 | 0.314020 | 0.226202 | 1.910030 | -0.178591 |
| phi_sensor_1_real_mode_1 | 0.235883 | 0.280647 | 0.345562 | 4.778868e-04 | 1.449108e-02 | 2.186062e-02 | 0.018406 | 0.122710 | 0.081342 | 1.187666 | -0.108525 |
| phi_sensor_1_imag_mode_1 | -0.001808 | -0.004901 | -0.002672 | 4.721987e-04 | 1.244827e-02 | 2.173013e-02 | 0.015610 | 0.132739 | 0.076041 | 1.392066 | -0.467149 |
| phi_sensor_2_real_mode_1 | 0.059839 | 0.084445 | 0.090034 | 1.858869e-03 | 3.319670e-02 | 4.311461e-02 | 0.042335 | 0.183201 | 0.141058 | 1.018408 | -0.261363 |
| phi_sensor_2_imag_mode_1 | 0.132250 | 0.201355 | 0.197384 | 1.889111e-03 | 3.345730e-02 | 4.346391e-02 | 0.038809 | 0.145762 | 0.112203 | 1.119945 | -0.196319 |
| phi_sensor_3_real_mode_1 | 0.029267 | 0.026855 | 0.043855 | 4.872248e-04 | 1.567161e-02 | 2.207317e-02 | 0.019313 | 0.085140 | 0.060448 | 1.142908 | -0.347643 |
| phi_sensor_3_imag_mode_1 | 0.144650 | 0.196427 | 0.215271 | 5.436915e-04 | 1.489610e-02 | 2.331719e-02 | 0.019295 | 0.093102 | 0.059478 | 1.208482 | -0.217378 |
| phi_sensor_4_real_mode_1 | 0.128943 | 0.011331 | 0.158480 | 1.133889e-04 | 1.345422e-03 | 1.064842e-02 | 0.004872 | 0.128727 | 0.016265 | 2.185421 | -0.628889 |
| phi_sensor_4_imag_mode_1 | 0.104741 | -0.024860 | 0.129700 | 9.967324e-05 | 8.627191e-04 | 9.983648e-03 | 0.003601 | 0.103435 | 0.008938 | 2.772788 | -0.854402 |
| phi_sensor_5_real_mode_1 | 0.038308 | 0.070715 | 0.057969 | 6.730534e-03 | 6.364064e-02 | 8.203983e-02 | 0.080370 | 0.172417 | 0.133749 | 1.020772 | -0.284593 |
| phi_sensor_5_imag_mode_1 | 0.009163 | -0.008211 | 0.014147 | 6.228319e-03 | 6.036036e-02 | 7.891970e-02 | 0.071172 | 0.147641 | 0.112921 | 1.108856 | -0.364586 |
| phi_sensor_6_real_mode_1 | -0.002862 | -0.022185 | -0.005775 | 1.406006e-03 | 2.736220e-02 | 3.749675e-02 | 0.034507 | 0.131767 | 0.096153 | 1.086652 | -0.372491 |
| phi_sensor_6_imag_mode_1 | 0.116870 | 0.182370 | 0.175459 | 9.516664e-04 | 2.092374e-02 | 3.084909e-02 | 0.027285 | 0.135005 | 0.091569 | 1.130621 | -0.218640 |
# get mean and variance of surrogate model
mean, var = model.get_mean_and_var()
mean, var
(array([ 3.67600713e+00, 3.36522458e-02, 5.59306544e-02, 5.37689401e-01,
-3.03495188e-02, 5.65309610e-02, 2.10715184e-02, -3.32197012e-03,
8.13386295e-02, 0.00000000e+00, 1.24268784e-01, 1.06129552e-01,
3.86785004e+00, 5.28932836e-01, -2.54485375e-02, 1.11790833e-01,
-8.61779369e-02, 4.31065075e-01, -1.30759358e-02, 9.99505976e-01,
1.59506780e-04, 1.71269194e-01, -1.85669054e-01, 8.82467821e-01,
-3.82528935e-03]),
array([6.73121666e-07, 2.74753228e-05, 7.90890535e-05, 1.57804492e-06,
7.02721580e-06, 5.04077143e-06, 4.42284174e-05, 4.72563533e-05,
1.41587582e-04, 0.00000000e+00, 1.37709870e-06, 1.69387603e-04,
8.72258422e-04, 5.54953922e-05, 5.04862713e-06, 1.58724691e-04,
9.28619031e-05, 3.04311118e-04, 1.27275674e-05, 1.19975189e-07,
1.39509606e-07, 1.02748180e-03, 2.85129318e-04, 2.24193154e-04,
9.96881476e-06]))
# select one test sample: q_df holds its input parameters, z_m_df the
# corresponding measured QoIs (used below for SHAP plots and the update)
q_df = model.X_test.iloc[23]
q_df = pd.DataFrame([q_df], columns=model.Q.param_names())
z_m_df = pd.DataFrame([model.y_test.iloc[23]], columns=QoI_names)
Sobol sensitivities¶
# set max_index for Sobol sensitivity analysis
max_index = 3
partial_variance, sobol_index = model.get_sobol_sensitivity(max_index)
# Plot the Sobol sensitivity indices, which quantify the contribution of each
# input parameter (and of parameter interactions) to the output variance.
# - max_index: the maximum index of the Sobol terms to include in the plot.
# - param_name: the quantity of interest (QoI) for which sensitivity is plotted
#   ('frequency_mode_1' in this case).
fig = model.plot_sobol_sensitivity(max_index)
fig = model.plot_sobol_sensitivity(max_index=max_index, param_name=QoI_param)
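For two inputs, the variance decomposition behind these plots has exactly three terms (which matches max_index = 3 above): the first-order contributions of temperature and humidity, and their interaction,
$$ S_T = \frac{V_T}{\operatorname{Var}(y)}, \quad S_H = \frac{V_H}{\operatorname{Var}(y)}, \quad S_{TH} = \frac{V_{TH}}{\operatorname{Var}(y)}, \qquad S_T + S_H + S_{TH} = 1 $$
where $V_T = \operatorname{Var}_T\!\left(\mathbb{E}[y \mid T]\right)$ is the partial variance attributable to temperature alone, and likewise for the other terms.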
SHAP values¶
# Compute SHAP (SHapley Additive exPlanations) values on the test data.
# SHAP values quantify how much each input feature contributes to the model's
# prediction for each individual sample, making the predictions interpretable.
shap_values = await model.get_shap_values(model.X_test)
Message: sample size for shap values is set to 100.
# Plot a SHAP waterfall for a single test sample, visualizing how each feature
# shifts the prediction away from the base value.
# - q: a one-row DataFrame with the sample's input parameters.
# - param_name: the QoI being analyzed ('frequency_mode_1' in this case).
fig = await model.plot_shap_single_waterfall(q=q_df, param_name=QoI_param)
fig = await model.plot_shap_multiple_waterfalls(q=q_df)
# Plot a SHAP beeswarm, which shows the distribution of SHAP values across the
# test samples for the chosen QoI ('frequency_mode_1' in this case).
# The beeswarm reveals global feature importance and how a feature's value
# relates to the direction and size of its impact on the prediction.
fig = await model.plot_shap_beeswarm(param_name=QoI_param)
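SHAP values are additive by construction: for each sample the feature contributions sum to the deviation of the prediction from the base value,
$$ \hat{y}(\mathbf{x}) = \phi_0 + \phi_T(\mathbf{x}) + \phi_H(\mathbf{x}) $$
where $\phi_0$ is the expected prediction over the background data, and $\phi_T$, $\phi_H$ are the temperature and humidity contributions shown in the plots above.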
Effects¶
effects = await model.subtract_effects(model.X_test.iloc[:100], model.y_test.iloc[:100], subtracted_effects)
fig = await model.plot_effects(effects, xticks=False)
Message: sample size for shap values is set to 100, which is the number of samples.
figure, alert, remaining_effects, error_ratio, outliers = await model.plot_subtract_effects_and_alert(q_df, z_m_df, subtracted_effects, xticks=False)
Using 394 background data samples could cause slower run times. Consider using shap.sample(data, K) or shap.kmeans(data, K) to summarize the background as K samples.
Message: sample size for shap values is set to 394, which is the number of samples.
Message: sample size for shap values is set to 1, which is the number of samples.
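Given the SHAP decomposition above, subtracting the environmental contributions presumably yields the corrected signal behind subtract_effects (a sketch of the assumed correction):
$$ \tilde{y} = y_m - \phi_T(\mathbf{x}) - \phi_H(\mathbf{x}) $$
A measurement $y_m$ whose corrected value $\tilde{y}$ deviates strongly from the base value $\phi_0$ then indicates a change that the environmental variables cannot explain, which is presumably what the alert, error_ratio, and outliers returned above are flagging.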
Update¶
fig = utils.plot_prior(model)  # visualize the prior before updating
DT = DigitalTwin(model, E)  # couple the surrogate with the measurement-error model
DT.update(z_m_df, nwalkers=nwalkers, nburn=nburn, niter=niter)  # MCMC update with the measurement
DT.get_mean_and_var_of_posterior()
gpc_map = DT.get_MAP()  # maximum a posteriori (MAP) estimate
fig = utils.plot_MCMC(model, DT, nwalkers=nwalkers, map_point=gpc_map)
MCMC creating Burning period
100%|██████████| 800/800 [01:11<00:00, 11.11it/s]
MCMC running
100%|██████████| 200/200 [00:19<00:00, 10.05it/s]
--- 92.00037240982056 seconds ---
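The update is a standard Bayesian inversion sampled with an ensemble MCMC sampler (the nwalkers/nburn/niter settings suggest an emcee-style sampler; an assumption):
$$ p(\mathbf{q} \mid \mathbf{z}_m) \propto p(\mathbf{z}_m \mid \mathbf{q}) \, p(\mathbf{q}), \qquad p(\mathbf{z}_m \mid \mathbf{q}) = \mathcal{N}\!\left(\mathbf{z}_m \mid \hat{\mathbf{y}}(\mathbf{q}), \operatorname{diag}(\boldsymbol{\sigma}^2)\right) $$
where the prior $p(\mathbf{q})$ comes from the parameter set Q, the likelihood combines the surrogate prediction $\hat{\mathbf{y}}$ with the error model E, and the MAP point plotted above is the posterior mode.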
Comparing models¶
# selecting modes to use
modes = [0,1,2,3,6]
# selecting environmental effects to use
X_cols = ['Temperature', 'Humidity']
# choose from: frequency, frequency+modeshapes, modeshapes, or custom list of columns
# here using only frequency columns
y_cols = 'frequency'
X, y, Q, QoI_names = select_cols(modeshapes_df, modes, X_cols, y_cols)
model_types = ['LinReg', 'DNN', 'GBT', 'gPCE']
metrics_all = []
for model_type in model_types:
model = SurrogateModel(Q, QoI_names, model_type, **config.get(model_type)['init_config'])
model.train_test_split(X, y, **split_config)
# fill missing values with the training-set mean (as above)
for col in QoI_names:
model.y_train[col] = model.y_train[col].fillna(model.y_train[col].mean())
model.y_test[col] = model.y_test[col].fillna(model.y_train[col].mean())
model.train(model.X_train, model.y_train, **config.get(model_type)['train_config'])
metrics, agg_metrics = model.evaluate_model(verbose=False)
metrics['Model Name'] = model_type
metrics = metrics.reset_index()
metrics_all.append(metrics)
metrics = pd.concat(metrics_all, axis=0)
col = metrics.pop('Model Name')
metrics.insert(0, 'Model Name', col)
print('\nEvaluation metrics:')
metrics
----- Training started for 'LinReg' model -----
Average train loss: 0.00178296938350, Average valid loss: 0.00192368640626
----- Training ended for 'LinReg' model -----
Using device: cpu
----- Training started for 'DNN' model -----
Epoch [0/200], train loss: 0.1987, validation loss: 0.1562
Epoch [50/200], train loss: 0.0115, validation loss: 0.0133
Epoch [100/200], train loss: 0.0112, validation loss: 0.0130
Early stopping triggered after 124 epochs!
Training stopped early. Restoring best model (monitored val_loss = 0.0129).
Average train loss: 0.01120073388724, Average valid loss: 0.01303984465577
----- Training ended for 'DNN' model -----
----- Training started for 'GBT' model -----
Fold 1/5 Fold 2/5 Fold 3/5 Fold 4/5 Fold 5/5
Average train loss: 0.00009870674395, Average valid loss: 0.00291320609061
----- Training ended for 'GBT' model -----
----- Training started for 'gPCE' model -----
Fold 1/5 Fold 2/5 Fold 3/5 Fold 4/5 Fold 5/5
Average train loss: 0.00177573805167, Average valid loss: 0.00184941096434
----- Training ended for 'gPCE' model -----

Evaluation metrics:
| | Model Name | Output_Name | Kendall_tau | Pearson | Spearman | MSE | MAE | RMSE | STD_of_label | NRMSE | NMAE | rel_RMSE | summed_metric |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | LinReg | frequency_mode_0 | 0.089125 | 0.128409 | 0.113541 | 0.001886 | 0.013947 | 0.043425 | 0.034972 | 0.111466 | 0.035799 | 1.241702 | -0.303542 |
| 1 | LinReg | frequency_mode_1 | 0.340718 | 0.547087 | 0.486452 | 0.003226 | 0.040911 | 0.056794 | 0.029734 | 0.314020 | 0.226202 | 1.910030 | -0.178591 |
| 2 | LinReg | frequency_mode_2 | 0.204298 | 0.360034 | 0.284559 | 0.005790 | 0.053906 | 0.076093 | 0.053357 | 0.184848 | 0.130950 | 1.426102 | -0.192404 |
| 3 | LinReg | frequency_mode_3 | 0.308793 | 0.517708 | 0.445463 | 0.010326 | 0.074622 | 0.101618 | 0.055812 | 0.309189 | 0.227051 | 1.820713 | -0.182916 |
| 4 | LinReg | frequency_mode_6 | -0.030514 | -0.060238 | -0.039075 | 0.005296 | 0.030422 | 0.072776 | 0.047945 | 0.103818 | 0.043398 | 1.517911 | -0.549246 |
| 0 | DNN | frequency_mode_0 | 0.072761 | 0.101677 | 0.093052 | 0.001850 | 0.016280 | 0.043015 | 0.034972 | 0.110414 | 0.041788 | 1.229983 | -0.320831 |
| 1 | DNN | frequency_mode_1 | 0.339432 | 0.534457 | 0.484566 | 0.003250 | 0.040893 | 0.057009 | 0.029734 | 0.315213 | 0.226104 | 1.917283 | -0.186276 |
| 2 | DNN | frequency_mode_2 | 0.207524 | 0.360551 | 0.288660 | 0.005813 | 0.053720 | 0.076242 | 0.053357 | 0.185210 | 0.130500 | 1.428900 | -0.190722 |
| 3 | DNN | frequency_mode_3 | 0.302901 | 0.508556 | 0.437004 | 0.010003 | 0.073719 | 0.100016 | 0.055812 | 0.304314 | 0.224301 | 1.792007 | -0.181182 |
| 4 | DNN | frequency_mode_6 | 0.003178 | 0.014432 | 0.003875 | 0.005166 | 0.029669 | 0.071876 | 0.047945 | 0.102534 | 0.042323 | 1.499135 | -0.492550 |
| 0 | GBT | frequency_mode_0 | 0.003386 | 0.006614 | 0.004493 | 0.002693 | 0.026704 | 0.051893 | 0.034972 | 0.133203 | 0.068544 | 1.483852 | -0.489786 |
| 1 | GBT | frequency_mode_1 | 0.190601 | 0.314997 | 0.278862 | 0.003590 | 0.044252 | 0.059918 | 0.029734 | 0.331296 | 0.244673 | 2.015111 | -0.410217 |
| 2 | GBT | frequency_mode_2 | 0.106375 | 0.240772 | 0.151284 | 0.007289 | 0.062870 | 0.085373 | 0.053357 | 0.207392 | 0.152726 | 1.600031 | -0.367200 |
| 3 | GBT | frequency_mode_3 | 0.212311 | 0.369922 | 0.309131 | 0.010947 | 0.079378 | 0.104626 | 0.055812 | 0.318342 | 0.241519 | 1.874612 | -0.327749 |
| 4 | GBT | frequency_mode_6 | -0.047848 | -0.045821 | -0.060836 | 0.006760 | 0.039878 | 0.082219 | 0.047945 | 0.117288 | 0.056887 | 1.714860 | -0.623122 |
| 0 | gPCE | frequency_mode_0 | 0.067688 | 0.104934 | 0.086881 | 0.001857 | 0.016248 | 0.043093 | 0.034972 | 0.110613 | 0.041708 | 1.232207 | -0.324235 |
| 1 | gPCE | frequency_mode_1 | 0.335200 | 0.538900 | 0.480222 | 0.002926 | 0.039265 | 0.054092 | 0.029734 | 0.299084 | 0.217104 | 1.819181 | -0.154953 |
| 2 | gPCE | frequency_mode_2 | 0.197708 | 0.347071 | 0.276826 | 0.005842 | 0.054538 | 0.076431 | 0.053357 | 0.185671 | 0.132486 | 1.432455 | -0.203617 |
| 3 | gPCE | frequency_mode_3 | 0.301380 | 0.512786 | 0.436878 | 0.009720 | 0.072601 | 0.098589 | 0.055812 | 0.299972 | 0.220901 | 1.766437 | -0.171797 |
| 4 | gPCE | frequency_mode_6 | 0.026699 | 0.065738 | 0.034626 | 0.005185 | 0.033240 | 0.072007 | 0.047945 | 0.102721 | 0.047418 | 1.501869 | -0.458269 |
# metrics visualization
sns.set(style="whitegrid")
metric_cols = ['NRMSE', 'rel_RMSE']
for metric in metric_cols:
plt.figure(figsize=(6, 3))
sns.lineplot(data=metrics, x='Output_Name', y=metric, hue='Model Name')
plt.xticks(rotation=50, fontsize=10)
plt.ylabel('Metric Value')
plt.xlabel('Feature', fontsize=12)
plt.title(f'{metric} for All Models per Feature')
plt.legend(ncol=1, fontsize='small', bbox_to_anchor=(1,1))
plt.show()