Project Overview¶
This notebook implements a comprehensive machine learning pipeline for predicting polymer properties from SMILES (Simplified Molecular Input Line Entry System) representations. The project aims to predict five key polymer properties:
- Tg (Glass Transition Temperature): Critical temperature for polymer applications, affecting flexibility and processing conditions
- FFV (Fractional Free Volume): Measure of polymer porosity and permeability, crucial for membrane and barrier applications
- Tc (Thermal Conductivity): Rate of heat transport through the polymer (W/(m·K)), important for processing and thermal management
- Density: Mass per unit volume of the polymer, affecting mechanical properties and applications
- Rg (Radius of Gyration): Measure of polymer chain size and conformation, related to molecular mobility
Project Goals¶
The primary objectives of this project are:
- Develop a robust prediction model for multiple polymer properties simultaneously
- Handle sparse and missing data effectively using advanced imputation techniques
- Extract meaningful molecular features from SMILES representations using cheminformatics
- Optimize model performance through systematic hyperparameter tuning
- Provide interpretable results with comprehensive evaluation metrics
Methodology¶
The approach applied herein combines several techniques:
- Multi-output Neural Network Modeling: Instead of building separate models for each property, a shared architecture is used to predict all five properties simultaneously, leveraging their interdependencies.
- Molecular Descriptor Extraction: SMILES strings are transformed into numerical features using RDKit, capturing chemical and physical properties relevant to polymer behavior.
- Advanced Data Imputation: The MatImputer algorithm is employed to handle the highly sparse dataset while preserving physical relationships between properties.
- Bayesian Hyperparameter Optimization: Systematic exploration of the model architecture space to find optimal configurations for the neural network.
This integrated approach allows us to effectively handle the challenges of polymer property prediction from molecular structure alone, creating a pipeline that can generalize to new polymer designs.
About the Data¶
The dataset contains the following key variables:
Data Dictionary¶
id: Unique identifier for each polymer sample
SMILES: Simplified Molecular Input Line Entry System representation of the polymer structure
Tg: Glass transition temperature (°C), a critical thermal property for polymer applications
FFV: Fractional free volume, representing the proportion of "empty space" in the polymer structure
Tc: Thermal conductivity (W/(m·K)), governing heat transport through the material
Density: Mass per unit volume (g/cm³), a fundamental physical property
Rg: Radius of gyration (Å), representing the effective size of the polymer chain
Data Challenges¶
The polymer dataset presents several challenges that require careful handling:
- Extreme Sparsity: Four of the five target properties have more than 90% missing values, requiring sophisticated imputation
- Structural Complexity: Polymer SMILES strings represent complex macromolecules with varying chain lengths
- Property Interdependence: The five target properties are physically related but with complex relationships
- Scale Differences: Properties vary in scale and units, requiring careful normalization
- Limited Training Data: Complete records with all properties are extremely rare
Our preprocessing pipeline addresses these challenges through strategic imputation, molecular feature extraction, and careful data normalization.
1. Setup and Configuration¶
In this section, we import the necessary libraries and set up the environment for data analysis and modeling. The project relies on the following:
- Standard libraries: For basic data manipulation and utilities
- Data processing: Pandas and NumPy for data handling and numerical operations
- Visualization: Matplotlib and Seaborn for creating informative plots
- Chemistry: RDKit for molecular structure processing and descriptor calculation
- Machine learning: Scikit-learn for data preprocessing and metrics
- Deep learning: TensorFlow and Keras for building and training neural networks
- Specialized libraries: MatImputer for materials-specific data imputation
The section also sets random seeds for reproducibility, configures warning filters to reduce noise, and adjusts display settings for better data inspection.
# Standard library imports
import os
import re
import pickle
import warnings
from collections import Counter
# Data processing and numerical computing
import numpy as np
import pandas as pd
import scipy
# Visualization
import matplotlib.pyplot as plt
import seaborn as sns
# Chemistry and molecular descriptors
from rdkit import Chem
from rdkit.Chem import Descriptors, Crippen, Lipinski, QED
from rdkit.Chem import rdMolDescriptors, rdPartialCharges, GraphDescriptors
from rdkit.ML.Descriptors import MoleculeDescriptors
# Machine learning
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
# Deep learning
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.models import Sequential, load_model
from tensorflow.keras.layers import Dense, Dropout, BatchNormalization, Input
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau
from tensorflow.keras import backend as K
import keras_tuner as kt
# Specialized libraries
from matimpute import MatImputer
import joblib
# Configuration
warnings.filterwarnings('ignore', category=UserWarning)
plt.style.use('seaborn-v0_8')
sns.set_palette("husl")
# Set random seeds for reproducibility
np.random.seed(42)
tf.random.set_seed(42)
# Display options for better data inspection
pd.set_option('display.max_columns', 20)
pd.set_option('display.precision', 4)
2. Data Loading and Initial Exploration¶
This section loads the training data and performs exploratory data analysis to understand the dataset structure and quality. The code:
- Loads the polymer data from CSV format
- Displays basic dataset information including shape and column names
- Analyzes missing data patterns using a custom function that calculates and reports the percentage of missing values for each property
- Displays sample records to understand the data format
The analysis reveals extreme sparsity in most target properties:
- Tg: 93.6% missing (only 511 available values)
- FFV: 11.8% missing (7030 available values)
- Tc: 90.8% missing (737 available values)
- Density: 92.3% missing (613 available values)
- Rg: 92.3% missing (614 available values)
This sparsity will be a key challenge addressed in later sections through advanced imputation techniques.
def analyze_missing_data(df):
"""
Analyze missing data patterns and return statistics.
Args:
df: DataFrame to analyze
Returns:
Dictionary with missing data statistics
"""
missing_stats = {}
total_samples = len(df)
print("\n" + "="*60)
print("MISSING DATA ANALYSIS")
print("="*60)
for column in df.columns:
missing_count = df[column].isna().sum()
missing_pct = (missing_count / total_samples) * 100
available_count = total_samples - missing_count
missing_stats[column] = {
'missing_count': missing_count,
'missing_percentage': missing_pct,
'available_count': available_count
}
print(f"{column:>8}: {available_count:>4} available ({missing_pct:>5.1f}% missing)")
return missing_stats
data = pd.read_csv('train.csv')
# Display basic information
print("\nDataset Overview:")
print(f"Shape: {data.shape}")
# Analyze missing data
missing_stats = analyze_missing_data(data)
# Display first few rows
print("\nFirst 5 rows:")
display(data.head())
Dataset Overview:
Shape: (7973, 7)

============================================================
MISSING DATA ANALYSIS
============================================================
      id: 7973 available (  0.0% missing)
  SMILES: 7973 available (  0.0% missing)
      Tg:  511 available ( 93.6% missing)
     FFV: 7030 available ( 11.8% missing)
      Tc:  737 available ( 90.8% missing)
 Density:  613 available ( 92.3% missing)
      Rg:  614 available ( 92.3% missing)

First 5 rows:
|   | id | SMILES | Tg | FFV | Tc | Density | Rg |
|---|---|---|---|---|---|---|---|
| 0 | 87817 | *CC(*)c1ccccc1C(=O)OCCCCCC | NaN | 0.3746 | 0.2057 | NaN | NaN |
| 1 | 106919 | *Nc1ccc([C@H](CCC)c2ccc(C3(c4ccc([C@@H](CCC)c5... | NaN | 0.3704 | NaN | NaN | NaN |
| 2 | 388772 | *Oc1ccc(S(=O)(=O)c2ccc(Oc3ccc(C4(c5ccc(Oc6ccc(... | NaN | 0.3789 | NaN | NaN | NaN |
| 3 | 519416 | *Nc1ccc(-c2c(-c3ccc(C)cc3)c(-c3ccc(C)cc3)c(N*)... | NaN | 0.3873 | NaN | NaN | NaN |
| 4 | 539187 | *Oc1ccc(OC(=O)c2cc(OCCCCCCCCCOCC3CCCN3c3ccc([N... | NaN | 0.3555 | NaN | NaN | NaN |
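The printed summary above can also be visualized. The following bar-chart sketch is not part of the original cell; it assumes the missing_stats dictionary returned by analyze_missing_data is still in memory:
# Bar chart of missing-value percentages per column, built from missing_stats
missing_pct = {col: stats['missing_percentage'] for col, stats in missing_stats.items()}
plt.figure(figsize=(8, 4))
plt.bar(missing_pct.keys(), missing_pct.values(), color='steelblue')
plt.ylabel('Missing values (%)')
plt.title('Missing Data by Column')
plt.ylim(0, 100)
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()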
3. Data Preprocessing and Cleaning¶
This section performs essential data validation and cleaning operations to ensure data quality before feature extraction. The code:
- Checks for duplicate records in the dataset
- Validates SMILES strings by attempting to parse them with RDKit
- Checks for duplicate SMILES strings that might represent the same polymer
- Prepares the data for feature extraction by ensuring proper formatting
The validation confirms that the dataset contains no duplicates and all SMILES strings are valid, providing a solid foundation for the subsequent feature extraction step.
# Check for duplicates
print(f'Number of duplicate records: {data.duplicated().sum()}')
Number of duplicate records: 0
# Check for invalid SMILES
invalid_smiles = []
for idx, smiles in enumerate(data['SMILES']):
mol = Chem.MolFromSmiles(smiles)
if mol is None:
invalid_smiles.append(idx)
print(f'Number of invalid SMILES in dataset: {len(invalid_smiles)}')
Number of invalid SMILES in dataset: 0
# Check for duplicate SMILES
print(f'Number of duplicate SMILES strings in dataset: {data["SMILES"].duplicated().sum()}')
Number of duplicate SMILES strings in dataset: 0
#Remove ID column (not needed for modeling)
data.drop('id',axis=1, inplace=True)
# Display basic statistics for property columns
property_stats = data.drop(['SMILES'], axis=1).describe()
display(property_stats)
|   | Tg | FFV | Tc | Density | Rg |
|---|---|---|---|---|---|
| count | 511.0000 | 7030.0000 | 737.0000 | 613.0000 | 614.0000 |
| mean | 96.4523 | 0.3672 | 0.2563 | 0.9855 | 16.4198 |
| std | 111.2283 | 0.0296 | 0.0895 | 0.1462 | 4.6086 |
| min | -148.0297 | 0.2270 | 0.0465 | 0.7487 | 9.7284 |
| 25% | 13.6745 | 0.3495 | 0.1860 | 0.8902 | 12.5403 |
| 50% | 74.0402 | 0.3643 | 0.2360 | 0.9482 | 15.0522 |
| 75% | 161.1476 | 0.3808 | 0.3305 | 1.0621 | 20.4111 |
| max | 472.2500 | 0.7771 | 0.5240 | 1.8410 | 34.6729 |
# Check for outliers (values beyond 3 standard deviations)
for col in data.drop(['SMILES'], axis=1):
if col in data.columns:
values = data[col].dropna()
if len(values) > 0:
mean_val = values.mean()
std_val = values.std()
outliers = ((values - mean_val).abs() > 3 * std_val).sum()
print(f"{col:>8}: {outliers} outliers ({outliers/len(values)*100:.1f}%)")
      Tg: 5 outliers (1.0%)
     FFV: 69 outliers (1.0%)
      Tc: 0 outliers (0.0%)
 Density: 12 outliers (2.0%)
      Rg: 2 outliers (0.3%)
An outlier analysis for Tg, Tc, Density, and Rg is difficult at this stage because of the high level of missingness; however, none of the flagged values seems physically impossible given the known behavior of some polymers (e.g., polyimides can reach glass transition temperatures above 400°C). Turning to FFV: while most polymers fall in the 10-25% range, polymers with bulky side groups or irregular structures, like many in this dataset, pack inefficiently and exhibit higher values, and microporous polymers with networks of interconnected pores can reach 55-80%. Examining the full range of FFV values in the dataset, none of the 69 flagged outliers falls outside the expected 10%-80% range.
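That last claim can be checked directly on the raw data. A minimal sketch, assuming the data DataFrame from Section 2 is still in memory:
# Flag FFV values more than 3 standard deviations from the mean,
# then confirm they still lie inside the physically plausible 0.10-0.80 window.
ffv = data['FFV'].dropna()
ffv_outliers = ffv[(ffv - ffv.mean()).abs() > 3 * ffv.std()]
print(f"FFV outliers (3-sigma rule): {len(ffv_outliers)}")
print(f"Outlier range: {ffv_outliers.min():.4f} - {ffv_outliers.max():.4f}")
print(f"All within 10%-80% window: {ffv_outliers.between(0.10, 0.80).all()}")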
4. Extract Molecular Descriptors from SMILES¶
This section transforms the SMILES representations into numerical descriptors that capture the chemical and physical properties of the polymers. The code:
- Defines an extract_descriptors() function to extract molecular descriptors using RDKit
- Calculates a comprehensive set of descriptors for each polymer
- Handles special cases like infinite values and problematic descriptors
- Creates a feature-rich dataset that represents the molecular structures numerically
The extracted descriptors capture information about:
- Molecular weight and size
- Polarity and charge distribution
- Topological features and connectivity
- Geometric properties and shape
- Functional group presence and distribution
This transformation is crucial as it converts the text-based SMILES strings into a numerical format suitable for machine learning algorithms, providing the model with relevant chemical information that influences physical behavior.
from rdkit import Chem
from rdkit.ML.Descriptors import MoleculeDescriptors
from rdkit.Chem import Descriptors
from matimpute import MatImputer
import numpy as np
# Descriptors to drop due to computational issues or redundancy
PROBLEMATIC_DESCRIPTORS = [
'NumRadicalElectrons', # Often zero for stable molecules
'BCUT2D_MWHI', 'BCUT2D_MWLOW', # BCUT descriptors can be unstable
'BCUT2D_CHGHI', 'BCUT2D_CHGLO',
'BCUT2D_LOGPHI', 'BCUT2D_LOGPLOW',
'BCUT2D_MRHI', 'BCUT2D_MRLOW',
'SMR_VSA8', 'SlogP_VSA9', # Specific VSA descriptors with issues
# Fragment descriptors rarely relevant for polymers
'fr_barbitur', 'fr_benzodiazepine', 'fr_dihydropyridine',
'fr_epoxide', 'fr_isothiocyan', 'fr_lactam', 'fr_nitroso',
'fr_prisulfonamd', 'fr_thiocyan'
]
def extract_descriptors(df):
# Extract molecular descriptors using RDKit
descriptor_names = [desc[0] for desc in Descriptors._descList]
calculator = MoleculeDescriptors.MolecularDescriptorCalculator(descriptor_names)
def compute_descriptors(smiles):
mol = Chem.MolFromSmiles(smiles)
if mol is None:
return [np.nan] * len(descriptor_names)
descriptors = list(calculator.CalcDescriptors(mol))
# Replace infinite values with NaN
descriptors = [np.nan if np.isinf(x) else round(x,4) for x in descriptors]
return descriptors
descriptor_data = df["SMILES"].apply(compute_descriptors)
descriptor_df = pd.DataFrame(descriptor_data.tolist(), columns=descriptor_names)
#Combine descriptors with original data
combined_df = pd.concat([df.reset_index(drop=True), descriptor_df], axis=1)
combined_df = combined_df.drop(PROBLEMATIC_DESCRIPTORS, axis=1)
# take log of Ipc to address 'too large float' error with imputer
combined_df['Ipc'] = np.log(combined_df['Ipc'])
return combined_df
features_all = extract_descriptors(data)
features_all
SMILES | Tg | FFV | Tc | Density | Rg | MaxAbsEStateIndex | MaxEStateIndex | MinAbsEStateIndex | MinEStateIndex | ... | fr_quatN | fr_sulfide | fr_sulfonamd | fr_sulfone | fr_term_acetylene | fr_tetrazole | fr_thiazole | fr_thiophene | fr_unbrch_alkane | fr_urea | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | *CC(*)c1ccccc1C(=O)OCCCCCC | NaN | 0.3746 | 0.2057 | NaN | NaN | 12.1445 | 12.1445 | 0.1059 | -0.1059 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 0 |
1 | *Nc1ccc([C@H](CCC)c2ccc(C3(c4ccc([C@@H](CCC)c5... | NaN | 0.3704 | NaN | NaN | NaN | 3.5234 | 3.5234 | 0.0989 | 0.0989 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | 0 |
2 | *Oc1ccc(S(=O)(=O)c2ccc(Oc3ccc(C4(c5ccc(Oc6ccc(... | NaN | 0.3789 | NaN | NaN | NaN | 13.7147 | 13.7147 | 0.1074 | -3.8294 | ... | 0 | 0 | 0 | 2 | 0 | 0 | 0 | 0 | 0 | 0 |
3 | *Nc1ccc(-c2c(-c3ccc(C)cc3)c(-c3ccc(C)cc3)c(N*)... | NaN | 0.3873 | NaN | NaN | NaN | 3.9787 | 3.9787 | 0.0546 | -0.2021 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
4 | *Oc1ccc(OC(=O)c2cc(OCCCCCCCCCOCC3CCCN3c3ccc([N... | NaN | 0.3555 | NaN | NaN | NaN | 13.7032 | 13.7032 | 0.0681 | -0.6863 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 12 | 0 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
7968 | *Oc1cc(CCCCCCCC)cc(OC(=O)c2cccc(C(*)=O)c2)c1 | NaN | 0.3675 | NaN | NaN | NaN | 12.5223 | 12.5223 | 0.1724 | -0.4362 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 5 | 0 |
7969 | *C(=O)OCCN(CCOC(=O)c1ccc2c(c1)C(=O)N(c1cccc(N3... | NaN | 0.3533 | NaN | NaN | NaN | 13.6794 | 13.6794 | 0.0058 | -0.7297 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
7970 | *c1cc(C(=O)NCCCCCCCC)cc(N2C(=O)c3ccc(-c4ccc5c(... | NaN | 0.3694 | NaN | NaN | NaN | 13.5556 | 13.5556 | 0.1938 | -0.6123 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 5 | 0 |
7971 | *C=C(*)c1ccccc1C | 261.6624 | NaN | NaN | NaN | NaN | 2.5023 | 2.5023 | 0.3962 | 0.3962 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
7972 | *c1ccc(OCCCCCCCCCCCOC(=O)CCCCC(=O)OCCCCCCCCCCC... | NaN | 0.3740 | NaN | NaN | NaN | 12.0202 | 12.0202 | 0.1164 | -0.1538 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 17 | 0 |
7973 rows × 203 columns
# Impute missing values in the feature matrix (molecular descriptors + available properties)
def impute_properties(df):
# Remove SMILES to apply imputer to features only
features = df.drop(['SMILES'], axis=1)
# Initialize MatImputer
imputer = MatImputer()
# transform the data
imputed_data = imputer.transform(features)
if isinstance(imputed_data, pd.DataFrame):
imputed_df = imputed_data
else:
# If imputer returns numpy array, convert to DataFrame
imputed_df = pd.DataFrame(imputed_data, columns=features.columns, index=features.index)
# Combine with SMILES column
df_imputed = pd.concat([df[['SMILES']].reset_index(drop=True), imputed_df.reset_index(drop=True)], axis=1)
return df_imputed
features_all = impute_properties(features_all)
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
# Separate features (X) and targets (y)
X = features_all.drop(['Density', 'Tg', 'FFV', 'Tc', 'Rg', 'SMILES'], axis=1)
y = features_all[['Density', 'Tg', 'FFV', 'Tc', 'Rg']]
# Split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)
# Scale features
x_scaler = StandardScaler()
X_train_scaled = x_scaler.fit_transform(X_train)
X_test_scaled = x_scaler.transform(X_test)
# Scale targets
y_scaler = StandardScaler()
y_train_scaled = y_scaler.fit_transform(y_train)
y_test_scaled = y_scaler.transform(y_test)
5. Tune Neural Network for Predicting the Polymer Properties¶
In this section, we implement a sophisticated hyperparameter optimization approach to develop a neural network model capable of simultaneously predicting multiple polymer properties. The process involves several key steps:
- Model Architecture Design: We define a flexible neural network architecture with tunable components including layer count, units per layer, activation functions, normalization techniques, and dropout rates. The model is designed to handle the multi-output prediction task for all five polymer properties (Tg, FFV, Tc, Density, and Rg) simultaneously.
- Custom Loss Function: A weighted mean absolute error loss function is implemented to balance the importance of different polymer properties during training, with weights themselves being hyperparameters subject to optimization.
- Bayesian Optimization: Rather than using grid search or manual tuning, we employ Keras Tuner's Bayesian optimization to efficiently explore the hyperparameter space, running 300 trials to find the optimal configuration.
- Model Training and Evaluation: The best model identified through hyperparameter tuning is trained with early stopping to prevent overfitting. The final model achieves impressive performance metrics with an R² score of approximately 0.88 on the test set, indicating strong predictive capability across all polymer properties.
The Bayesian optimization process systematically explores this vast hyperparameter space to find the configuration that minimizes validation loss. After identifying the best hyperparameters, the model is retrained with early stopping to prevent overfitting.
The final model evaluation shows excellent performance with a test MAE of approximately 0.19 (on the standardized targets) and an R² score of 0.88, indicating that the model can accurately predict all five polymer properties from SMILES representations.
The training curves show steady improvement in both loss and MAE metrics, with good convergence and minimal overfitting, demonstrating the effectiveness of the hyperparameter optimization and regularization techniques.
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import (
Dense, Dropout, BatchNormalization, Input, LayerNormalization, GaussianNoise,
LeakyReLU, ELU, Activation
)
from tensorflow.keras.optimizers import Adam
from tensorflow.keras import activations, backend as K
import tensorflow as tf
# Custom weighted MAE loss
def weighted_mae_loss(weights):
weights = K.constant(weights)
def loss(y_true, y_pred):
return K.mean(K.abs(y_true - y_pred) * weights)
return loss
# Helper function for activation layers
def get_activation_layer(name):
if name == 'leaky_relu':
return LeakyReLU()
elif name == 'elu':
return ELU()
elif name == 'gelu':
return Activation(activations.gelu)
else:
return Activation(name) # e.g., 'relu', 'tanh'
# Main model builder
def model_builder(hp):
model = Sequential()
model.add(Input(shape=(X_train_scaled.shape[1],)))
# Optional input noise
if hp.Boolean('use_input_noise'):
noise_std = hp.Float('input_noise_std', 0.01, 0.2, step=0.01)
model.add(GaussianNoise(stddev=noise_std))
# Tunable number of hidden layers
num_layers = hp.Int('num_layers', 2, 8)
for i in range(num_layers):
units = hp.Int(f'units_{i}', 32, 256, step=32)
activation_name = hp.Choice(f'activation_{i}', ['relu', 'tanh', 'leaky_relu', 'elu', 'gelu'])
activation_layer = get_activation_layer(activation_name)
model.add(Dense(units))
model.add(activation_layer)
# Normalization choice per layer
norm_type = hp.Choice(f'norm_type_{i}', ['none', 'batch', 'layer'])
if norm_type == 'batch':
model.add(BatchNormalization())
elif norm_type == 'layer':
model.add(LayerNormalization())
# Optional dropout
dropout_rate = hp.Float(f'dropout_{i}', 0.0, 0.4, step=0.1)
if dropout_rate > 0:
model.add(Dropout(dropout_rate))
# Output layer for 5 regression targets
model.add(Dense(5))
# Tunable output weights for custom loss
weights = [
hp.Float('w_density', 0.5, 2.5, step=0.1),
hp.Float('w_tg', 0.5, 2.5, step=0.1),
hp.Float('w_ffv', 0.5, 2.5, step=0.1),
hp.Float('w_tc', 0.5, 2.5, step=0.1),
hp.Float('w_rg', 0.5, 2.5, step=0.1),
]
# Tunable learning rate
lr = hp.Float('learning_rate', 1e-5, 1e-3, sampling='log')
model.compile(
optimizer=Adam(learning_rate=lr),
loss=weighted_mae_loss(weights),
metrics=['mae', tf.keras.metrics.R2Score(class_aggregation="uniform_average")]
)
return model
# Bayesian Optimization Tuner
tuner = kt.BayesianOptimization(
model_builder,
objective='val_loss',
max_trials=300,
directory='tuner_results',
project_name='polymer_property_tuning_bayesian',
seed=42
)
# Early stopping callback
early_stop = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)
# Run search
tuner.search(
X_train_scaled, y_train_scaled,
epochs=200,
validation_split=0.25,
callbacks=[early_stop],
batch_size=64,
verbose=1
)
# Best hyperparameters
best_hps = tuner.get_best_hyperparameters(1)[0]
print("\nBest Hyperparameters:")
for param, value in best_hps.values.items():
print(f"{param}: {value}")
# Retrain with best hyperparams
best_model = tuner.hypermodel.build(best_hps)
history = best_model.fit(
X_train_scaled, y_train_scaled,
epochs=200,
validation_split=0.25,
callbacks=[early_stop],
batch_size=64
)
# Evaluate
loss, mae, r2 = best_model.evaluate(X_test_scaled, y_test_scaled)
print(f"\nBest Test Loss: {loss}, Test MAE: {mae}, R^2: {r2}")
Trial 300 Complete [00h 00m 21s] val_loss: 0.10138080269098282 Best val_loss So Far: 0.09176424145698547 Total elapsed time: 6d 00h 00m 33s Best Hyperparameters: use_input_noise: False num_layers: 2 units_0: 256 activation_0: relu norm_type_0: layer dropout_0: 0.2 units_1: 96 activation_1: gelu norm_type_1: none dropout_1: 0.0 w_density: 0.5 w_tg: 0.5 w_ffv: 0.5 w_tc: 0.5 w_rg: 0.5 learning_rate: 0.001 input_noise_std: 0.12 units_2: 256 activation_2: tanh norm_type_2: none dropout_2: 0.0 units_3: 32 activation_3: leaky_relu norm_type_3: layer dropout_3: 0.0 units_4: 96 activation_4: leaky_relu norm_type_4: layer dropout_4: 0.0 units_5: 128 activation_5: tanh norm_type_5: layer dropout_5: 0.2 units_6: 32 activation_6: leaky_relu norm_type_6: none dropout_6: 0.2 units_7: 96 activation_7: tanh norm_type_7: none dropout_7: 0.0 Epoch 1/200 71/71 ━━━━━━━━━━━━━━━━━━━━ 2s 9ms/step - loss: 0.3872 - mae: 0.7744 - r2_score: -0.0463 - val_loss: 0.1874 - val_mae: 0.3747 - val_r2_score: 0.7099 Epoch 2/200 71/71 ━━━━━━━━━━━━━━━━━━━━ 1s 5ms/step - loss: 0.2176 - mae: 0.4352 - r2_score: 0.6445 - val_loss: 0.1606 - val_mae: 0.3212 - val_r2_score: 0.7711 Epoch 3/200 71/71 ━━━━━━━━━━━━━━━━━━━━ 1s 10ms/step - loss: 0.1852 - mae: 0.3704 - r2_score: 0.7246 - val_loss: 0.1446 - val_mae: 0.2892 - val_r2_score: 0.8050 Epoch 4/200 71/71 ━━━━━━━━━━━━━━━━━━━━ 1s 7ms/step - loss: 0.1659 - mae: 0.3317 - r2_score: 0.7667 - val_loss: 0.1368 - val_mae: 0.2736 - val_r2_score: 0.8226 Epoch 5/200 71/71 ━━━━━━━━━━━━━━━━━━━━ 1s 5ms/step - loss: 0.1544 - mae: 0.3087 - r2_score: 0.7943 - val_loss: 0.1323 - val_mae: 0.2645 - val_r2_score: 0.8328 Epoch 6/200 71/71 ━━━━━━━━━━━━━━━━━━━━ 0s 4ms/step - loss: 0.1464 - mae: 0.2927 - r2_score: 0.8106 - val_loss: 0.1283 - val_mae: 0.2566 - val_r2_score: 0.8397 Epoch 7/200 71/71 ━━━━━━━━━━━━━━━━━━━━ 1s 8ms/step - loss: 0.1378 - mae: 0.2755 - r2_score: 0.8282 - val_loss: 0.1248 - val_mae: 0.2495 - val_r2_score: 0.8467 Epoch 8/200 71/71 ━━━━━━━━━━━━━━━━━━━━ 1s 6ms/step - loss: 0.1316 - mae: 0.2632 - r2_score: 0.8372 - val_loss: 0.1215 - val_mae: 0.2429 - val_r2_score: 0.8549 Epoch 9/200 71/71 ━━━━━━━━━━━━━━━━━━━━ 1s 10ms/step - loss: 0.1265 - mae: 0.2530 - r2_score: 0.8479 - val_loss: 0.1171 - val_mae: 0.2342 - val_r2_score: 0.8599 Epoch 10/200 71/71 ━━━━━━━━━━━━━━━━━━━━ 1s 7ms/step - loss: 0.1228 - mae: 0.2456 - r2_score: 0.8560 - val_loss: 0.1183 - val_mae: 0.2366 - val_r2_score: 0.8597 Epoch 11/200 71/71 ━━━━━━━━━━━━━━━━━━━━ 1s 7ms/step - loss: 0.1215 - mae: 0.2431 - r2_score: 0.8602 - val_loss: 0.1156 - val_mae: 0.2312 - val_r2_score: 0.8626 Epoch 12/200 71/71 ━━━━━━━━━━━━━━━━━━━━ 0s 5ms/step - loss: 0.1185 - mae: 0.2371 - r2_score: 0.8644 - val_loss: 0.1101 - val_mae: 0.2202 - val_r2_score: 0.8739 Epoch 13/200 71/71 ━━━━━━━━━━━━━━━━━━━━ 1s 4ms/step - loss: 0.1147 - mae: 0.2294 - r2_score: 0.8728 - val_loss: 0.1124 - val_mae: 0.2247 - val_r2_score: 0.8719 Epoch 14/200 71/71 ━━━━━━━━━━━━━━━━━━━━ 0s 4ms/step - loss: 0.1133 - mae: 0.2265 - r2_score: 0.8752 - val_loss: 0.1087 - val_mae: 0.2175 - val_r2_score: 0.8770 Epoch 15/200 71/71 ━━━━━━━━━━━━━━━━━━━━ 1s 6ms/step - loss: 0.1110 - mae: 0.2220 - r2_score: 0.8782 - val_loss: 0.1074 - val_mae: 0.2148 - val_r2_score: 0.8785 Epoch 16/200 71/71 ━━━━━━━━━━━━━━━━━━━━ 1s 7ms/step - loss: 0.1095 - mae: 0.2190 - r2_score: 0.8805 - val_loss: 0.1076 - val_mae: 0.2152 - val_r2_score: 0.8775 Epoch 17/200 71/71 ━━━━━━━━━━━━━━━━━━━━ 1s 6ms/step - loss: 0.1076 - mae: 0.2152 - r2_score: 0.8846 - val_loss: 0.1060 - val_mae: 0.2121 - val_r2_score: 0.8820 Epoch 
18/200 71/71 ━━━━━━━━━━━━━━━━━━━━ 1s 4ms/step - loss: 0.1065 - mae: 0.2130 - r2_score: 0.8873 - val_loss: 0.1046 - val_mae: 0.2092 - val_r2_score: 0.8809 Epoch 19/200 71/71 ━━━━━━━━━━━━━━━━━━━━ 1s 5ms/step - loss: 0.1043 - mae: 0.2087 - r2_score: 0.8898 - val_loss: 0.1056 - val_mae: 0.2113 - val_r2_score: 0.8835 Epoch 20/200 71/71 ━━━━━━━━━━━━━━━━━━━━ 1s 6ms/step - loss: 0.1033 - mae: 0.2066 - r2_score: 0.8935 - val_loss: 0.1023 - val_mae: 0.2047 - val_r2_score: 0.8854 Epoch 21/200 71/71 ━━━━━━━━━━━━━━━━━━━━ 1s 5ms/step - loss: 0.1010 - mae: 0.2020 - r2_score: 0.8941 - val_loss: 0.1047 - val_mae: 0.2093 - val_r2_score: 0.8843 Epoch 22/200 71/71 ━━━━━━━━━━━━━━━━━━━━ 1s 5ms/step - loss: 0.1014 - mae: 0.2029 - r2_score: 0.8978 - val_loss: 0.1009 - val_mae: 0.2017 - val_r2_score: 0.8906 Epoch 23/200 71/71 ━━━━━━━━━━━━━━━━━━━━ 1s 6ms/step - loss: 0.0991 - mae: 0.1982 - r2_score: 0.8995 - val_loss: 0.1016 - val_mae: 0.2032 - val_r2_score: 0.8907 Epoch 24/200 71/71 ━━━━━━━━━━━━━━━━━━━━ 1s 7ms/step - loss: 0.0980 - mae: 0.1961 - r2_score: 0.9025 - val_loss: 0.1022 - val_mae: 0.2044 - val_r2_score: 0.8887 Epoch 25/200 71/71 ━━━━━━━━━━━━━━━━━━━━ 1s 5ms/step - loss: 0.0972 - mae: 0.1944 - r2_score: 0.9031 - val_loss: 0.1017 - val_mae: 0.2035 - val_r2_score: 0.8881 63/63 ━━━━━━━━━━━━━━━━━━━━ 0s 4ms/step - loss: 0.0970 - mae: 0.1940 - r2_score: 0.8987 Best Test Loss: 0.09957732260227203, Test MAE: 0.19915464520454407, R^2: 0.8780268430709839
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import r2_score
# Optional: Model Summary
print("\nBest Model Summary:")
best_model.summary()
# 1. Training Curves
def plot_training_curves(history):
plt.figure(figsize=(14, 5))
# Loss
plt.subplot(1, 2, 1)
plt.plot(history.history['loss'], label='Train Loss')
plt.plot(history.history['val_loss'], label='Val Loss')
plt.title('Loss over Epochs')
plt.xlabel('Epoch')
plt.ylabel('Weighted MAE Loss')
plt.legend()
# MAE
plt.subplot(1, 2, 2)
plt.plot(history.history['mae'], label='Train MAE')
plt.plot(history.history['val_mae'], label='Val MAE')
plt.title('MAE over Epochs')
plt.xlabel('Epoch')
plt.ylabel('Mean Absolute Error')
plt.legend()
plt.tight_layout()
plt.show()
plot_training_curves(history)
Best Model Summary:
Model: "sequential_1"
┏━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━┓ ┃ Layer (type) ┃ Output Shape ┃ Param # ┃ ┡━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━┩ │ dense_3 (Dense) │ (None, 256) │ 50,688 │ ├──────────────────────────────────────┼─────────────────────────────┼─────────────────┤ │ activation_2 (Activation) │ (None, 256) │ 0 │ ├──────────────────────────────────────┼─────────────────────────────┼─────────────────┤ │ layer_normalization │ (None, 256) │ 512 │ │ (LayerNormalization) │ │ │ ├──────────────────────────────────────┼─────────────────────────────┼─────────────────┤ │ dropout (Dropout) │ (None, 256) │ 0 │ ├──────────────────────────────────────┼─────────────────────────────┼─────────────────┤ │ dense_4 (Dense) │ (None, 96) │ 24,672 │ ├──────────────────────────────────────┼─────────────────────────────┼─────────────────┤ │ activation_3 (Activation) │ (None, 96) │ 0 │ ├──────────────────────────────────────┼─────────────────────────────┼─────────────────┤ │ dense_5 (Dense) │ (None, 5) │ 485 │ └──────────────────────────────────────┴─────────────────────────────┴─────────────────┘
Total params: 229,073 (894.82 KB)
Trainable params: 76,357 (298.27 KB)
Non-trainable params: 0 (0.00 B)
Optimizer params: 152,716 (596.55 KB)
# Predict and inverse scale
y_pred_scaled = best_model.predict(X_test_scaled)
# Inverse transform predictions and actuals
y_pred = y_scaler.inverse_transform(y_pred_scaled)
y_true = y_scaler.inverse_transform(y_test_scaled)
# Column names
targets = ['density', 'tg', 'ffv', 'tc', 'rg']
# Plot predictions
def plot_predicted_vs_actual(y_true, y_pred, target_names):
plt.figure(figsize=(15, 10))
for i, target in enumerate(target_names):
plt.subplot(2, 3, i+1)
sns.scatterplot(x=y_true[:, i], y=y_pred[:, i], alpha=0.6)
max_val = max(y_true[:, i].max(), y_pred[:, i].max())
min_val = min(y_true[:, i].min(), y_pred[:, i].min())
plt.plot([min_val, max_val], [min_val, max_val], 'r--')
r2 = r2_score(y_true[:, i], y_pred[:, i])
plt.title(f'{target} (R² = {r2:.3f})')
plt.xlabel('Actual')
plt.ylabel('Predicted')
plt.tight_layout()
plt.show()
plot_predicted_vs_actual(y_true, y_pred, targets)
63/63 ━━━━━━━━━━━━━━━━━━━━ 0s 2ms/step
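Beyond the scatter plots, per-property error metrics on the original scale are useful for reporting. A minimal sketch, assuming y_true, y_pred, and targets from the cell above:
# Per-property MAE, RMSE, and R² on the inverse-transformed (original-scale) test predictions
metrics_rows = []
for i, target in enumerate(targets):
    metrics_rows.append({
        'property': target,
        'MAE': mean_absolute_error(y_true[:, i], y_pred[:, i]),
        'RMSE': np.sqrt(mean_squared_error(y_true[:, i], y_pred[:, i])),
        'R2': r2_score(y_true[:, i], y_pred[:, i]),
    })
metrics_df = pd.DataFrame(metrics_rows).set_index('property')
display(metrics_df.round(4))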
# Save a copy of the best model and scalers for later use
def save_model_and_scalers(model, x_scaler, y_scaler, base_path):
"""
Saves a Keras model to .h5 format along with X and y StandardScaler objects.
Args:
model: Trained Keras model.
x_scaler: StandardScaler used to scale X features.
y_scaler: StandardScaler used to scale y targets.
base_path: Path prefix for saving (e.g., 'models/polymer_model').
Files will be saved as:
- base_path.h5 (Keras model)
- base_path_xscaler.pkl
- base_path_yscaler.pkl
"""
# Ensure the directory exists
os.makedirs(os.path.dirname(base_path), exist_ok=True)
# Save model (.h5)
model.save(f"{base_path}.h5")
print(f"Model saved to: {base_path}.h5")
# Save scalers
with open(f"{base_path}_xscaler.pkl", 'wb') as xf:
pickle.dump(x_scaler, xf)
print(f"X scaler saved to: {base_path}_xscaler.pkl")
with open(f"{base_path}_yscaler.pkl", 'wb') as yf:
pickle.dump(y_scaler, yf)
print(f"Y scaler saved to: {base_path}_yscaler.pkl")
save_model_and_scalers(best_model, x_scaler, y_scaler, base_path='models/polymer_model')
WARNING:absl:You are saving your model as an HDF5 file via `model.save()` or `keras.saving.save_model(model)`. This file format is considered legacy. We recommend using instead the native Keras format, e.g. `model.save('my_model.keras')` or `keras.saving.save_model(model, 'my_model.keras')`.
Model saved to: models/polymer_model.h5
X scaler saved to: models/polymer_model_xscaler.pkl
Y scaler saved to: models/polymer_model_yscaler.pkl
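For later reuse, the saved artifacts can be loaded back in a fresh session. A minimal sketch, assuming the file paths produced by save_model_and_scalers above; because the model was compiled with a custom loss, it is loaded with compile=False for inference only:
# Reload the trained model and scalers for inference.
# compile=False avoids having to re-register the custom weighted MAE loss.
loaded_model = load_model('models/polymer_model.h5', compile=False)
with open('models/polymer_model_xscaler.pkl', 'rb') as xf:
    loaded_x_scaler = pickle.load(xf)
with open('models/polymer_model_yscaler.pkl', 'rb') as yf:
    loaded_y_scaler = pickle.load(yf)
# Sanity check: predictions from the reloaded model should match the in-memory model
reloaded_preds = loaded_y_scaler.inverse_transform(loaded_model.predict(X_test_scaled, verbose=0))
print(reloaded_preds[0])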
import numpy as np
import pandas as pd
from rdkit import Chem
from rdkit.Chem import Descriptors, Draw
from rdkit.ML.Descriptors import MoleculeDescriptors
import matplotlib.pyplot as plt
def draw_prediction(smiles, model, x_scaler, y_scaler):
'''Takes in a SMILES string, model, and scalers, extracts descriptors exactly
as done during training, and displays molecule image with predictions.
Parameters:
-----------
smiles : str
SMILES string representing the molecular structure
model : tensorflow.keras.Model
Trained model for making predictions
x_scaler : sklearn.preprocessing.StandardScaler
Fitted scaler for input features
y_scaler : sklearn.preprocessing.StandardScaler
Fitted scaler for target variables
'''
# Ensure input is a string
if not isinstance(smiles, str):
raise ValueError("Input must be a SMILES string.")
# Validate SMILES string early
mol = Chem.MolFromSmiles(smiles)
if mol is None:
raise ValueError(f"Invalid SMILES string: '{smiles}'. Please provide a valid molecular structure.")
# Descriptor setup - EXACTLY as in training
descriptor_names = [desc[0] for desc in Descriptors._descList]
calculator = MoleculeDescriptors.MolecularDescriptorCalculator(descriptor_names)
def compute_descriptors(smiles):
mol = Chem.MolFromSmiles(smiles)
if mol is None:
return [np.nan] * len(descriptor_names)
descriptors = list(calculator.CalcDescriptors(mol))
# Replace infinite values with NaN and round, exactly as in training
descriptors = [np.nan if np.isinf(x) else round(x, 4) for x in descriptors]
return descriptors
# Compute descriptors
descriptor_data = compute_descriptors(smiles)
descriptor_df = pd.DataFrame([descriptor_data], columns=descriptor_names)
# Drop problematic descriptors
PROBLEMATIC_DESCRIPTORS = [
'NumRadicalElectrons', 'BCUT2D_MWHI', 'BCUT2D_MWLOW', 'BCUT2D_CHGHI', 'BCUT2D_CHGLO',
'BCUT2D_LOGPHI', 'BCUT2D_LOGPLOW', 'BCUT2D_MRHI', 'BCUT2D_MRLOW',
'SMR_VSA8', 'SlogP_VSA9', 'fr_barbitur', 'fr_benzodiazepine', 'fr_dihydropyridine',
'fr_epoxide', 'fr_isothiocyan', 'fr_lactam', 'fr_nitroso', 'fr_prisulfonamd', 'fr_thiocyan'
]
for col in PROBLEMATIC_DESCRIPTORS:
if col in descriptor_df.columns:
descriptor_df.drop(columns=col, inplace=True)
# Log-transform 'Ipc'
if 'Ipc' in descriptor_df.columns:
descriptor_df['Ipc'] = np.log(descriptor_df['Ipc'])
# Handle infinite values and NaN - EXACTLY as in training
# descriptor_df = descriptor_df.replace([np.inf, -np.inf], np.nan)
# Check if any NaN values remain after preprocessing
if descriptor_df.isna().any().any():
print("Warning: NaN values detected after preprocessing. This may affect prediction quality.")
print("NaN columns:", descriptor_df.columns[descriptor_df.isna().any()].tolist())
# Fill NaN with 0 or mean - this is a fallback since training data had no NaN
# descriptor_df = descriptor_df.fillna(0)
# Make predictions
try:
descriptor_df_trans = x_scaler.transform(descriptor_df)
prediction = model.predict(descriptor_df_trans, verbose=0)
prediction = y_scaler.inverse_transform(prediction.reshape(1,-1))
except Exception as e:
print(f"Error during prediction: {e}")
print("Descriptor shape:", descriptor_df.shape)
print("Expected features:", len(x_scaler.feature_names_in_) if hasattr(x_scaler, 'feature_names_in_') else "Unknown")
raise
# Display predictions in elegant format
properties = {
'Density': {'value': prediction[0][0], 'unit': 'g/cm³'},
'Tg': {'value': prediction[0][1], 'unit': '°C'},
'FFV': {'value': prediction[0][2], 'unit': ''},
'Tc': {'value': prediction[0][3], 'unit': 'W/(m·K)'},
'Rg': {'value': prediction[0][4], 'unit': 'Å'}
}
print("\n" + "="*50)
print(" "*15 + "PROPERTY PREDICTIONS")
print("="*50)
for prop_name, prop_data in properties.items():
value = prop_data['value']
unit = prop_data['unit']
if unit:
print(f"{prop_name:>12}: {value:>10.4f} {unit}")
else:
print(f"{prop_name:>12}: {value:>10.4f}")
print("="*50 + "\n")
# Draw molecule with error handling
try:
img = Draw.MolToImage(mol)
plt.figure(figsize=(8, 6))
plt.imshow(img)
plt.axis('off')
plt.title(f'Molecular Structure\nSMILES: {smiles}', fontsize=12, pad=20)
plt.show()
except Exception as e:
raise RuntimeError(f"Failed to generate molecular image: {str(e)}")
# Generate predictions
predictions = best_model.predict(X_test_scaled)
# Examine a single prediction
prediction = y_scaler.inverse_transform(predictions[0].reshape(1,-1))
prediction[0]
63/63 ━━━━━━━━━━━━━━━━━━━━ 0s 1ms/step
array([ 0.912781 , 50.629364 , 0.38949415, 0.34004 , 17.554157 ], dtype=float32)
# Examine Property Predictions alongside SMILES and Molecular Structure
draw_prediction(data.SMILES[0], model=best_model, x_scaler=x_scaler, y_scaler=y_scaler)
Warning: NaN values detected after preprocessing. This may affect prediction quality.
NaN columns: ['MaxPartialCharge', 'MinPartialCharge', 'MaxAbsPartialCharge', 'MinAbsPartialCharge']

==================================================
               PROPERTY PREDICTIONS
==================================================
     Density:     1.0780 g/cm³
          Tg:   120.3645 °C
         FFV:     0.3671
          Tc:     0.2448 W/(m·K)
          Rg:    18.7984 Å
==================================================
6. Conclusion and Future Work¶
This section summarizes the achievements of the project and outlines potential improvements. Key points include:
- Successfully developed a multi-property prediction model for polymers based solely on SMILES representations
- Effectively handled the challenges of sparse data through advanced imputation
- Achieved reasonable prediction accuracy across multiple properties
- Identified limitations and areas for improvement:
- Expanding the training dataset with more complete property measurements
- Incorporating polymer-specific descriptors beyond standard RDKit features
- Exploring ensemble methods to improve prediction robustness
- Implementing uncertainty quantification for predictions
These future directions provide a roadmap for continuing to enhance the model's capabilities and address its current limitations, potentially leading to more accurate and reliable polymer property predictions.
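As an illustration of the uncertainty-quantification direction listed above, a minimal Monte Carlo dropout sketch, assuming best_model (which contains a Dropout layer), X_test_scaled, and y_scaler are still in memory; this is one possible approach, not part of the current pipeline:
# Monte Carlo dropout: run repeated stochastic forward passes (training=True keeps
# dropout active) and use the spread of predictions as a rough uncertainty estimate.
n_passes = 50
mc_preds = np.stack([
    y_scaler.inverse_transform(best_model(X_test_scaled, training=True).numpy())
    for _ in range(n_passes)
])
mean_pred = mc_preds.mean(axis=0)   # point estimate per sample and property
std_pred = mc_preds.std(axis=0)     # predictive spread per sample and property
for i, target in enumerate(['Density', 'Tg', 'FFV', 'Tc', 'Rg']):
    print(f"{target:>8}: mean predictive std across test set = {std_pred[:, i].mean():.4f}")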