Data-Driven Modeling of a Sine Function¶

This notebook demonstrates the core functionality of the Data-Driven module using a simple analytical sine function as a toy model. The goal is to approximate a known function:

$$ y = a \sin(bt + c) $$

using machine learning techniques based on synthetic data. The input parameters $\mathbf{x} = [a, b, c]^T$ are sampled from predefined distributions, and the outputs $\mathbf{y}$ are computed at $N = 10$ discrete time steps $t_0, \dots, t_9$. Gaussian noise is added to simulate observational uncertainty.
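
For concreteness, a dataset of this shape can be generated along the following lines; the CSV files loaded below were produced elsewhere, and the distributions, sample count, and time grid in this sketch are assumptions:

# Hypothetical data-generation sketch (assumed distributions and time grid)
import numpy as np
import pandas as pd

rng = np.random.default_rng(42)
n_samples = 1000
t = np.linspace(0, 2 * np.pi, 10)        # 10 time steps t_0, ..., t_9

a = rng.uniform(0.8, 1.2, n_samples)     # amplitude
b = rng.uniform(0.9, 1.1, n_samples)     # angular frequency
c = rng.normal(0.0, 0.1, n_samples)      # phase shift
x_df = pd.DataFrame({'a': a, 'b': b, 'c': c})

y = a[:, None] * np.sin(b[:, None] * t[None, :] + c[:, None])
y += rng.normal(0.0, 0.02, y.shape)      # Gaussian observation noise
y_df = pd.DataFrame(y, columns=[f't_{i}' for i in range(t.size)])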

The notebook then walks through the following steps:

  • A gPCE (Generalized Polynomial Chaos Expansion) model is trained to approximate the mapping from input parameters to function outputs.

  • Sobol sensitivity indices are computed to assess the global influence of each input parameter on the response.

  • SHAP (SHapley Additive exPlanations) values are computed to explain the contributions of individual input features to model predictions.

  • The effects of selected input parameters are isolated and subtracted from the model response to better understand their influence.

Imports¶

In [1]:
import sys
import os
parent_dir = os.path.abspath("../../libraries/surrogate_modelling")
sys.path.insert(0, parent_dir)
In [2]:
from distributions import *
from simparameter import SimParameter
from simparameter_set import SimParamSet
from surrogate_model import SurrogateModel
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from digital_twin import DigitalTwin
import utils

import warnings
warnings.filterwarnings("ignore", category=RuntimeWarning)
warnings.filterwarnings("ignore", category=FutureWarning)

Load data¶

In [3]:
x_df = pd.read_csv('../data/sine_data/example_x_df.csv')
y_df = pd.read_csv('../data/sine_data/example_y_df.csv')

For training and sensitivity analysis¶

In [4]:
# Build the input parameter set from the sampled data
Q = utils.get_simparamset_from_data(x_df)
# One quantity of interest (QoI) per time step
QoI_names = y_df.columns.to_list()

# Maximum interaction order for the Sobol sensitivity analysis
max_index = 2
# QoI used in the single-QoI plots below
QoI_param = 't_3'

# Input parameters whose effects are subtracted in the 'Effects' section
subtracted_effects = ['a', 'b', 'c']

For measurements¶

In [5]:
# Measured outputs and the corresponding observation noise levels
z_m_df = pd.read_csv('../data/sine_data/example_z_m_df.csv')
sigma = pd.read_csv('../data/sine_data/example_sigma_df.csv')

For update¶

In [6]:
# MCMC settings: ensemble walkers, burn-in steps, and sampling iterations
nwalkers = 64
nburn = 800
niter = 200

# Measurement-error parameter set built from the noise levels
E = utils.generate_stdrn_simparamset(sigma.values.reshape(-1, 1))

# Reference parameter sample used in the SHAP and effect plots below
q = np.load('../data/sine_data/example_q.npy')
q_df = pd.read_csv('../data/sine_data/example_q_df.csv')

Train model¶

Choosing the Model¶

In this notebook, the surrogate model can be built with one of the following methods: gPCE, DNN, GBT, or LinReg.

  • gPCE (Generalized Polynomial Chaos Expansion): Models uncertainty by expressing the output as a polynomial expansion in the random inputs, with coefficients determined by projecting the system's response onto orthogonal polynomials (see the minimal sketch after this list).

  • DNN (Deep Neural Networks): Machine learning models composed of interconnected neurons, capable of learning complex, non-linear patterns from data.

  • GBT (Gradient Boosting Trees): Combines weak decision trees into a strong predictive model, iteratively correcting errors from previous trees for robust performance.

  • LinReg (Linear Regression): A statistical method that models the relationship between a dependent variable and one or more independent variables by fitting a linear equation to observed data.
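
To make the gPCE idea concrete, here is a minimal one-dimensional sketch, independent of the surrogate library: it expands $\sin(\xi)$ for a standard normal $\xi$ in probabilists' Hermite polynomials by least squares, then reads the mean and variance directly off the coefficients. The degree matches the gPCE configuration below; everything else is illustrative.

# Minimal 1D gPCE sketch, independent of the surrogate library:
# approximate f(xi) = sin(xi), xi ~ N(0, 1), with probabilists' Hermite
# polynomials He_n, which are orthogonal under the standard normal weight.
import numpy as np
from numpy.polynomial.hermite_e import hermevander
from math import factorial

rng = np.random.default_rng(0)
p = 5                                  # polynomial degree, as in the gPCE config below
xi = rng.standard_normal(2000)         # samples of the standard normal germ
y = np.sin(xi)                         # model evaluations

V = hermevander(xi, p)                 # regression matrix [He_0(xi), ..., He_p(xi)]
coef, *_ = np.linalg.lstsq(V, y, rcond=None)

# Orthogonality yields the moments directly from the coefficients:
# E[y] = c_0 and Var[y] = sum_{n>=1} c_n^2 * n!  (since E[He_n^2] = n!)
mean = coef[0]
var = sum(c**2 * factorial(n) for n, c in enumerate(coef) if n > 0)
print(mean, var)                       # ~0.0 and ~0.43 = Var[sin(xi)]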

In [7]:
method = "gPCE"
In [8]:
# Model configurations
match method:
    # DNN model configurations
    case "DNN":
        config = {
            'init_config' : {
                'layers': [
                    {'neurons': 512, 'activation': 'relu', 'dropout': 0.2},
                    {'neurons': 256, 'activation': 'sigmoid', 'dropout': 0.2},
                    {'neurons': 128, 'activation': 'relu', 'dropout': None},
                ],
                'outputAF': 'tanh'        # output activation function
            },
            'train_config' : {
                'optimizer': 'Adam',
                'loss': 'MSE',
                'epochs': 100,
                'batch_size': 32,
                'k_fold': None,
                'early_stopping': {
                    'patience': 25,
                    'min_delta': 0.0001
                }
            }
        }
    # gPCE model configurations
    case "gPCE":
        config = {
            'init_config' : {
                'p' : 5                   # total polynomial degree of the expansion
            },
            'train_config' : {
                'k_fold': 5               # 5-fold cross-validation
            }
        }
    # GBT model configurations
    case "GBT":
        config = {
            'init_config' : {
                'gbt_method': 'xgboost'
            },
            'train_config' : {
                'max_depth': 3,
                'num_of_iter': 250,
                'k_fold': 5
            }
        }
    # LinReg (and any other method) needs no extra configuration
    case _:
        config = {'init_config': {}, 'train_config': {}}
In [9]:
split_config = {
    'train_test_ratio': 0.2, 
    'random_seed': 1997,
    'split_type': 'no_shuffle'
    }
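A plausible reading of this configuration, sketched as a hypothetical helper (whether 'train_test_ratio' is the train or the test fraction is not stated here; this sketch assumes the test fraction, and 'no_shuffle' is taken to mean the original row order is kept, so the random seed is unused):

# Hypothetical stand-in for the library's splitting logic
def no_shuffle_split(x_df, y_df, test_ratio=0.2):
    n_test = int(len(x_df) * test_ratio)
    return (x_df.iloc[:-n_test], x_df.iloc[-n_test:],
            y_df.iloc[:-n_test], y_df.iloc[-n_test:])

X_train, X_test, y_train, y_test = no_shuffle_split(x_df, y_df)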
In [10]:
# Initialize surrogate model
# This creates an instance of the SurrogateModel class using the provided sampling data (Q)
model = SurrogateModel(Q, QoI_names, method, **config['init_config'])
# Split data into training and testing sets
model.train_test_split(x_df, y_df, **split_config)
# Train the model
model.train(model.X_train, model.y_train, **config['train_config'])
----- Training started for 'gPCE' model -----
Fold 1/5
Fold 2/5
Fold 3/5
Fold 4/5
Fold 5/5
Average train loss: 0.00000000000000, Average valid loss: 0.00000000000000
----- Training ended for 'gPCE' model -----
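The fold-by-fold log above corresponds to a standard k-fold cross-validation loop. As a rough, library-independent illustration of the mechanics (the surrogate library's internals may differ), with scikit-learn and a stand-in linear model:

# Rough k-fold sketch with scikit-learn (illustrative stand-in model)
import numpy as np
from sklearn.model_selection import KFold
from sklearn.linear_model import LinearRegression

rng = np.random.default_rng(0)
X = rng.standard_normal((200, 3))
y = X @ np.array([1.0, -2.0, 0.5])     # noise-free toy target

losses = []
for k, (tr, va) in enumerate(KFold(n_splits=5).split(X), start=1):
    print(f"Fold {k}/5")
    reg = LinearRegression().fit(X[tr], y[tr])
    losses.append(np.mean((reg.predict(X[va]) - y[va]) ** 2))
print(f"Average valid loss: {np.mean(losses):.14f}")   # ~0 for this exact-fit toy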
In [11]:
# Mean and variance of the surrogate model output over the input distribution
mean, var = model.get_mean_and_var()
mean, var
Out[11]:
(tensor([5.7904e-04, 3.4249e-01, 6.4277e-01, 8.6492e-01, 9.8207e-01, 9.8027e-01,
         8.6017e-01, 6.3689e-01, 3.3810e-01, 5.3171e-04], dtype=torch.float64),
 tensor([9.7192e-05, 4.6615e-04, 1.0875e-03, 1.0816e-03, 3.9445e-04, 5.1472e-04,
         3.8145e-03, 1.1676e-02, 2.2656e-02, 3.2258e-02], dtype=torch.float64))

Sobol sensitivities¶

In [12]:
# Compute partial variances and Sobol indices up to interaction order max_index (set to 2 above)
partial_variance, sobol_index = model.get_sobol_sensitivity(max_index)
In [13]:
# Plot Sobol sensitivity indices
# The 'plot_sobol_sensitivity' method plots the Sobol sensitivity indices, which quantify
# the contribution of each input parameter to the output uncertainty.
# Called without arguments it uses its defaults; the optional arguments
# - max_index (maximum interaction order) and
# - param_name (the quantity of interest to plot, 't_3' here)
# are used in the next cell.
fig = model.plot_sobol_sensitivity()
[Figure: Sobol sensitivity indices]
In [14]:
fig = model.plot_sobol_sensitivity(max_index=max_index, param_name=QoI_param)
[Figure: Sobol sensitivity indices for QoI 't_3' up to interaction order 2]
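
For intuition, the first-order Sobol indices of this toy model can also be estimated directly by Monte Carlo with the Saltelli pick-freeze scheme; the fixed time and the uniform input ranges below are assumptions, not the distributions stored in x_df:

# Monte Carlo first-order Sobol indices via the Saltelli (2010) pick-freeze
# estimator for y = a*sin(b*t + c) at one fixed time t (assumed input ranges)
import numpy as np

rng = np.random.default_rng(1997)
t = 0.3
f = lambda X: X[:, 0] * np.sin(X[:, 1] * t + X[:, 2])

N = 100_000
lo = np.array([0.8, 0.9, -0.2])        # assumed lower bounds for (a, b, c)
hi = np.array([1.2, 1.1, 0.2])         # assumed upper bounds
A = rng.uniform(lo, hi, (N, 3))
B = rng.uniform(lo, hi, (N, 3))

yA, yB = f(A), f(B)
var = np.var(np.concatenate([yA, yB]))
for i, name in enumerate(['a', 'b', 'c']):
    ABi = A.copy()
    ABi[:, i] = B[:, i]                # vary only the i-th input
    S_i = np.mean(yB * (f(ABi) - yA)) / var
    print(f"S_{name} = {S_i:.3f}")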

SHAP values¶

In [15]:
# Compute SHAP (SHapley Additive exPlanations) values for the test data.
# SHAP values attribute each prediction of the model to the individual input
# features, explaining the impact of each parameter on the output.
shap_values = await model.get_shap_values(model.X_test)
Message: sample size for shap values is set to 100.
  0%|          | 0/100 [00:00<?, ?it/s]
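Conceptually, SHAP values are Shapley values of a cooperative game over the input features. For a toy three-feature model they can be computed exactly by enumerating coalitions; the model and background data below are assumptions, and this is not the library's SHAP backend:

# Exact Shapley values by coalition enumeration for a tiny 3-feature model
import numpy as np
from itertools import combinations
from math import factorial

f = lambda X: 2 * X[:, 0] + X[:, 1] * X[:, 2]   # assumed toy model
rng = np.random.default_rng(0)
background = rng.standard_normal((500, 3))      # reference distribution
x = np.array([1.0, 0.5, -2.0])                  # instance to explain

def v(S):
    """Coalition value: E[f(X)] with the features in S fixed to x."""
    Xs = background.copy()
    Xs[:, list(S)] = x[list(S)]
    return f(Xs).mean()

n = 3
phi = np.zeros(n)
for i in range(n):
    others = [j for j in range(n) if j != i]
    for k in range(n):
        for S in combinations(others, k):
            w = factorial(len(S)) * factorial(n - len(S) - 1) / factorial(n)
            phi[i] += w * (v(S + (i,)) - v(S))

print(phi)                                      # per-feature contributions
print(phi.sum(), f(x[None]).item() - v(()))     # both equal f(x) - E[f(X)]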
In [16]:
# Plot SHAP Single Waterfall Plot
# The 'plot_shap_single_waterfall' method generates a SHAP waterfall plot for a single test sample, 
# visualizing how each feature contributes to the model's prediction for that sample.
# - q: A DataFrame containing the sample data (e.g., a specific parameter set) for which the SHAP values are calculated.
# - param_name: The name of the quantity of interest (QoI) being analyzed ('t_3' in this case).
# The SHAP waterfall plot shows the cumulative effect of each feature on the model's prediction, helping to interpret individual predictions.
fig = await model.plot_shap_single_waterfall(q=q_df, param_name=QoI_param)
  0%|          | 0/1 [00:00<?, ?it/s]
[Figure: SHAP waterfall plot for QoI 't_3']
In [17]:
# SHAP waterfall plots for the sample q_df
fig = await model.plot_shap_multiple_waterfalls(q=q_df)
  0%|          | 0/1 [00:00<?, ?it/s]
[Figure: SHAP waterfall plots for the sample q_df]
In [18]:
# Plot SHAP beeswarm plot
# The 'plot_shap_beeswarm' method visualizes the distribution of SHAP values
# across many test samples, showing the impact of each feature on the model's predictions.
# - param_name: the quantity of interest (QoI) for which SHAP values are shown ('t_3' in this case).
# The beeswarm plot summarizes global feature importance and how feature values
# relate to their contributions across samples.
fig = await model.plot_shap_beeswarm(param_name=QoI_param)
[Figure: SHAP beeswarm plot for QoI 't_3']

Effects¶

In [19]:
# Isolate the effects of 'a', 'b', and 'c' on the first 100 test samples
# and subtract them from the model response
effects = await model.subtract_effects(model.X_test.iloc[:100], model.y_test.iloc[:100], subtracted_effects)
fig = await model.plot_effects(effects)
Message: sample size for shap values is set to 100, which is the number of samples.
  0%|          | 0/100 [00:00<?, ?it/s]
[Figure: effects of 'a', 'b', and 'c' on the model response]
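
The idea behind effect subtraction can be seen on a linear model, where the Shapley contribution of each feature relative to a background mean is exact and can simply be removed from the response; all weights and values below are illustrative:

# Effect subtraction on a linear model, where Shapley values are exact:
# phi_i = w_i * (x_i - mu_i) relative to the background mean mu
import numpy as np

w = np.array([2.0, -1.0, 0.5])         # assumed weights for (a, b, c)
mu = np.array([1.0, 1.0, 0.0])         # background means
x = np.array([1.2, 0.9, 0.3])          # instance to explain
y = w @ x

phi = w * (x - mu)                     # exact Shapley values here
subtracted = ['b', 'c']                # e.g. the b- and c-effects, as in the next cell
idx = [['a', 'b', 'c'].index(s) for s in subtracted]
remaining = y - phi[idx].sum()         # response with those effects removed
print(y, remaining)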
In [20]:
# Subtract the effects of 'b' and 'c' from the measured response and raise an alert on outliers
figure, alert, remaining_effects, error_ratio, outliers = await model.plot_subtract_effects_and_alert(q_df, z_m_df, subtracted_effects[1:])
Message: sample size for shap values is set to 200, which is the number of samples.
  0%|          | 0/200 [00:00<?, ?it/s]
Message: sample size for shap values is set to 1, which is the number of samples.
  0%|          | 0/1 [00:00<?, ?it/s]
[Figure: measured response after subtracting the effects of 'b' and 'c']

Update¶

In [21]:
# Plot the prior distributions of the input parameters
fig = utils.plot_prior(model)
[Figure: prior distributions of the input parameters]
In [22]:
# Build the digital twin from the surrogate model and the error model,
# then update the input distribution with the measurements via MCMC
DT = DigitalTwin(model, E)
DT.update(z_m_df, nwalkers=nwalkers, nburn=nburn, niter=niter)
DT.get_mean_and_var_of_posterior()
gpc_map = DT.get_MAP()  # maximum a posteriori estimate
fig = utils.plot_MCMC(model, DT, nwalkers=nwalkers, map_point=gpc_map)
MCMC creating
Burning period
100%|██████████| 800/800 [01:07<00:00, 11.77it/s]
MCMC running
100%|██████████| 200/200 [00:16<00:00, 11.90it/s]
--- 84.87607264518738 seconds ---
[Figure: MCMC posterior samples with the MAP estimate]
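
The walker, burn-in, and iteration settings together with the progress output are consistent with an affine-invariant ensemble sampler. As an assumption (the DigitalTwin internals are not shown here), the same kind of update can be sketched directly with emcee, inferring q = (a, b, c) from noisy observations of the sine model:

# Assumed emcee-based sketch of the Bayesian update for q = (a, b, c)
import numpy as np
import emcee

rng = np.random.default_rng(0)
t = np.linspace(0, 2 * np.pi, 10)
q_true = np.array([1.0, 1.0, 0.1])
sigma_n = 0.05
z_m = (q_true[0] * np.sin(q_true[1] * t + q_true[2])
       + sigma_n * rng.standard_normal(t.size))       # synthetic measurements

def log_prob(q):
    a, b, c = q
    if not (0 < a < 2 and 0 < b < 2 and -1 < c < 1):  # flat box prior (assumed)
        return -np.inf
    resid = z_m - a * np.sin(b * t + c)
    return -0.5 * np.sum((resid / sigma_n) ** 2)      # Gaussian likelihood

nwalkers, ndim, nburn, niter = 64, 3, 800, 200
p0 = q_true + 1e-2 * rng.standard_normal((nwalkers, ndim))
sampler = emcee.EnsembleSampler(nwalkers, ndim, log_prob)
sampler.run_mcmc(p0, nburn + niter, progress=True)
posterior = sampler.get_chain(discard=nburn, flat=True)
print(posterior.mean(axis=0), posterior.var(axis=0))  # posterior mean and variance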