I. Background¶
Accurately modeling default risk is essential for lenders and loan servicers, as it enables them to assess borrower creditworthiness, price loans appropriately, minimize losses, and ensure regulatory compliance. By identifying high-risk applicants early, financial institutions can make more informed lending decisions and implement targeted risk mitigation strategies.
The data used in this exercise was synthetically generated to reflect realistic borrower behaviors and financial conditions while protecting privacy. Although not sourced from real individuals, it serves as a robust proxy for demonstrating analytical techniques and model development in a credit risk context.
II. Import Libraries and Define Helper Functions¶
# Data manipulation and analysis libraries
import pandas as pd
import numpy as np
# Data visualization libraries
import matplotlib.pyplot as plt
import seaborn as sns
# Load the loan application dataset from CSV file
data = pd.read_csv('loan_data.csv')
# Display the first few rows of the dataset
data.head()
Age | AnnualIncome | CreditScore | EmploymentStatus | EducationLevel | Experience | LoanAmount | LoanDuration | HomeOwnershipStatus | MonthlyDebtPayments | ... | PctInstallmentDebt | SecuredDebt | UnsecuredDebt | SecuredUnsecuredRatio | BankruptcyRecencyMonths | DefaultsRecencyMonths | InterestRate | MonthlyLoanPayment | TotalDebtToIncomeRatio | RiskScore | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 45 | 24779 | 476 | Employed | High School | 22 | 17948 | 24 | Rent | 314 | ... | 0.3496 | 5829.2647 | 3185.7352 | 1.8298 | 0 | 0 | 0.2598 | 966.83 | 0.6202 | 0.940 |
1 | 38 | 36803 | 566 | Employed | Associate | 15 | 9532 | 36 | Mortgage | 394 | ... | 0.8140 | 7881.3310 | 4638.6689 | 1.6990 | 0 | 0 | 0.2309 | 369.47 | 0.2489 | 0.830 |
2 | 47 | 101673 | 618 | Employed | Master | 25 | 11082 | 60 | Other | 492 | ... | 0.7561 | 14896.5611 | 4212.4388 | 3.5363 | 0 | 0 | 0.2046 | 296.49 | 0.0930 | 0.632 |
3 | 58 | 88304 | 688 | Employed | Master | 36 | 12810 | 12 | Mortgage | 342 | ... | 0.4690 | 7551.6435 | 2709.3564 | 2.7872 | 0 | 0 | 0.1416 | 1151.17 | 0.2029 | 0.640 |
4 | 37 | 18607 | 601 | Employed | High School | 14 | 27331 | 36 | Other | 277 | ... | 0.7074 | 10018.4605 | 5360.5394 | 1.8689 | 0 | 30 | 0.2249 | 1050.75 | 0.8562 | 0.920 |
5 rows × 45 columns
Below we define helper functions for visualizing data features, exploring extreme values, and measuring model performance:
# Improved visualization function that combines and enhances the previous functions
def plot_features(data, target, features, feature_type='numerical'):
"""
Plots feature distributions and relationships with target variable.
Parameters:
data (pd.DataFrame): The DataFrame containing the features.
target (str): The target variable name.
features (list): List of feature names to plot.
feature_type (str): Type of features - 'numerical' or 'categorical'.
"""
num_features = len(features)
# Set figure size dynamically based on the number of features
fig, axes = plt.subplots(num_features, 2, figsize=(12, 4 * num_features))
if num_features == 1:
axes = [axes] # Ensure axes is iterable when there's only one feature
for i, feature in enumerate(features):
# Left plot: relationship with target
if feature_type == 'numerical':
sns.scatterplot(x=data[feature], y=data[target], ax=axes[i][0])
axes[i][0].set_title(f'Relationship: {feature} vs {target}')
else: # categorical
sns.barplot(x=data[feature], y=data[target], ax=axes[i][0])
axes[i][0].set_title(f'Mean {target} by {feature}')
if data[feature].nunique() > 5: # Rotate x-labels if many categories
axes[i][0].set_xticklabels(axes[i][0].get_xticklabels(), rotation=45, ha='right')
# Right plot: distribution
sns.histplot(data[feature], kde=(feature_type=='numerical'), ax=axes[i][1])
axes[i][1].set_title(f'Distribution of {feature}')
plt.tight_layout()
plt.show()
# Wrapper functions for backward compatibility
def plot_feature_distributions(data, target, features):
"""Wrapper for plot_features with numerical type."""
return plot_features(data, target, features, feature_type='numerical')
def plot_cat_features(data, target, features):
"""Wrapper for plot_features with categorical type."""
return plot_features(data, target, features, feature_type='categorical')
def calculate_outlier_table(df, columns):
"""Return a DataFrame summarizing outlier percentages for specified columns.
Args:
df (pd.DataFrame): The DataFrame containing the data.
columns (list of str): List of column names to check for outliers.
Returns:
pd.DataFrame: Prettified table with outlier percentages for each column.
"""
results = []
for col in columns:
if col in df.columns:
Q1 = df[col].quantile(0.25)
Q3 = df[col].quantile(0.75)
IQR = Q3 - Q1
lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR
total = len(df)
left_outliers = df[df[col] < lower_bound]
right_outliers = df[df[col] > upper_bound]
left_pct = (len(left_outliers) / total) * 100 if total > 0 else 0
right_pct = (len(right_outliers) / total) * 100 if total > 0 else 0
total_pct = left_pct + right_pct
results.append({
'Column': col,
'Total Outliers (%)': round(total_pct, 2),
'Lower Outliers (%)': round(left_pct, 2),
'Upper Outliers (%)': round(right_pct, 2)
})
else:
results.append({
'Column': col,
'Total Outliers (%)': None,
'Lower Outliers (%)': None,
'Upper Outliers (%)': None
})
# Create DataFrame from results
result_df = pd.DataFrame(results)
# Apply styling to the DataFrame
styled_df = result_df.style.background_gradient(
cmap='YlOrRd',
subset=['Total Outliers (%)']
).format({
'Total Outliers (%)': '{:.2f}%',
'Lower Outliers (%)': '{:.2f}%',
'Upper Outliers (%)': '{:.2f}%'
}).set_properties(**{
'text-align': 'center',
'border': '1px solid gray',
'padding': '5px'
}).set_table_styles([
{'selector': 'th', 'props': [('background-color', '#f2f2f2'),
('color', 'black'),
('font-weight', 'bold'),
('text-align', 'center'),
('border', '1px solid gray'),
('padding', '5px')]},
{'selector': 'caption', 'props': [('caption-side', 'top'),
('font-size', '1.2em'),
('font-weight', 'bold')]}
]).set_caption('Outlier Analysis')
return styled_df
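As a quick illustration of the outlier helper, it can be pointed at a few of the dollar-denominated columns that tend to have heavy tails (an example call only; any subset of numeric columns works):
# Example: summarize IQR-based outliers for a few skewed columns
calculate_outlier_table(data, ['AnnualIncome', 'LoanAmount', 'TotalAssets', 'TotalDebtToIncomeRatio'])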
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
def evaluate_models(models, X_test, y_test):
"""
Evaluates multiple trained models on test data and returns a styled comparison table.
Parameters:
models (dict): A dictionary where keys are model names and values are trained sklearn models.
X_test (pd.DataFrame or np.array): Test feature set.
y_test (pd.Series or np.array): True labels for the test set.
Returns:
pd.DataFrame: Styled DataFrame with model performance metrics.
"""
results = []
for name, model in models.items():
y_pred = model.predict(X_test)
# Calculate multiple metrics for a more comprehensive evaluation
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
results.append([name, rmse, mae, r2])
# Create DataFrame for better visualization
results_df = pd.DataFrame(results, columns=["Model", "RMSE", "MAE", "R²"])
# Sort by RMSE (lower is better)
results_df = results_df.sort_values(by="RMSE")
# Apply styling to highlight the best model
styled_df = results_df.style.highlight_min(subset=["RMSE", "MAE"], color="lightgreen") \
.highlight_max(subset=["R²"], color="lightgreen") \
.format({"RMSE": "{:.4f}", "MAE": "{:.4f}", "R²": "{:.4f}"}) \
.set_caption("Model Performance Comparison") \
.set_properties(**{"text-align": "center", "border": "1px solid gray", "padding": "5px"}) \
.set_table_styles([{"selector": "th", "props": [("background-color", "#f2f2f2"),
("color", "black"),
("font-weight", "bold"),
("text-align", "center"),
("border", "1px solid gray"),
("padding", "5px")]}])
return styled_df
import warnings
warnings.filterwarnings("ignore")
import shap
def plot_shap_bar(model, X_train, top_n=10, plot_type='bar', sample_size=None):
"""
Plots SHAP feature importance as a bar chart or summary plot.
Parameters:
- model: Trained model (supports SHAP values like tree-based models or has a wrapper)
- X_train: Feature matrix used for training
- top_n: Number of top features to display (default is 10)
- plot_type: Type of plot to generate ('bar', 'summary', or 'both')
- sample_size: Number of samples to use for SHAP calculation (None = all samples)
Returns:
- Visualization of feature importances using SHAP values
"""
# Use a sample of data if specified (for large datasets)
if sample_size is not None and sample_size < len(X_train):
X_sample = X_train.sample(sample_size, random_state=42)
else:
X_sample = X_train
# Compute SHAP values
explainer = shap.Explainer(model, X_sample)
shap_values = explainer(X_sample, check_additivity=False)
# Get mean absolute SHAP values for each feature
shap_importance = np.abs(shap_values.values).mean(axis=0)
# Create DataFrame of feature importance
feature_importance = pd.DataFrame({'Feature': X_train.columns, 'SHAP Importance': shap_importance})
# Sort by importance
feature_importance = feature_importance.sort_values(by="SHAP Importance", ascending=False)
top_features = feature_importance.head(top_n)
if plot_type in ['bar', 'both']:
# Plot as a bar chart
plt.figure(figsize=(10, 6))
bars = plt.barh(top_features['Feature'], top_features['SHAP Importance'], color='royalblue')
plt.gca().invert_yaxis() # Highest importance at the top
# Add value labels to the bars
for bar in bars:
width = bar.get_width()
plt.text(width * 1.01, bar.get_y() + bar.get_height()/2,
f'{width:.4f}', va='center')
plt.xlabel("Mean |SHAP Value|")
plt.ylabel("Feature")
plt.title("SHAP Feature Importance (Bar Plot)")
plt.tight_layout()
plt.show()
if plot_type in ['summary', 'both']:
# Create a summary plot for the top features
plt.figure(figsize=(10, 6))
top_indices = [list(X_train.columns).index(feat) for feat in top_features['Feature']]
shap.summary_plot(shap_values.values[:, top_indices],
X_sample.iloc[:, top_indices],
feature_names=top_features['Feature'].tolist(),
show=False)
plt.title("SHAP Summary Plot (Feature Impact on Prediction)")
plt.tight_layout()
plt.show()
# Return the feature importance DataFrame for further analysis
return feature_importance
III. Data Preparation - Inspect and Clean¶
# Load the loan application dataset from CSV file
data = pd.read_csv('loan_data.csv')
# Display the last 5 rows of the dataset to inspect its structure
data.tail()
Age | AnnualIncome | CreditScore | EmploymentStatus | EducationLevel | Experience | LoanAmount | LoanDuration | HomeOwnershipStatus | MonthlyDebtPayments | ... | PctInstallmentDebt | SecuredDebt | UnsecuredDebt | SecuredUnsecuredRatio | BankruptcyRecencyMonths | DefaultsRecencyMonths | InterestRate | MonthlyLoanPayment | TotalDebtToIncomeRatio | RiskScore | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
24995 | 32 | 77950 | 515 | Employed | Bachelor | 9 | 30081 | 12 | Other | 330 | ... | 0.8420 | 15875.6960 | 7277.3039 | 2.1815 | 0 | 0 | 0.2235 | 2820.56 | 0.4850 | 0.820 |
24996 | 36 | 60009 | 569 | Employed | High School | 14 | 9543 | 12 | Own | 383 | ... | 0.5364 | 16950.8734 | 2549.1265 | 6.6496 | 0 | 0 | 0.1751 | 872.69 | 0.2511 | 0.688 |
24997 | 51 | 106104 | 592 | Employed | High School | 27 | 27377 | 36 | Mortgage | 357 | ... | 0.6687 | 4017.8071 | 5394.1928 | 0.7448 | 0 | 0 | 0.1898 | 1003.32 | 0.1538 | 0.800 |
24998 | 47 | 45132 | 612 | Employed | Master | 24 | 21589 | 48 | Mortgage | 633 | ... | 0.5353 | 22550.6980 | 2249.3019 | 10.0256 | 0 | 0 | 0.1893 | 644.72 | 0.3397 | 0.840 |
24999 | 40 | 54040 | 623 | Self-Employed | Master | 14 | 8497 | 24 | Mortgage | 385 | ... | 0.4662 | 5170.2935 | 5171.7064 | 0.9997 | 0 | 0 | 0.1852 | 426.36 | 0.1801 | 0.704 |
5 rows × 45 columns
data.describe()
Age | AnnualIncome | CreditScore | Experience | LoanAmount | LoanDuration | MonthlyDebtPayments | CreditCardUtilizationRate | NumberOfOpenCreditLines | NumberOfCreditInquiries | ... | PctInstallmentDebt | SecuredDebt | UnsecuredDebt | SecuredUnsecuredRatio | BankruptcyRecencyMonths | DefaultsRecencyMonths | InterestRate | MonthlyLoanPayment | TotalDebtToIncomeRatio | RiskScore | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
count | 25000.000000 | 25000.000000 | 25000.000000 | 25000.000000 | 25000.00000 | 25000.000000 | 25000.000000 | 25000.000000 | 25000.000000 | 25000.000000 | ... | 25000.000000 | 25000.000000 | 2.500000e+04 | 25000.000000 | 25000.000000 | 25000.000000 | 25000.000000 | 25000.000000 | 25000.000000 | 25000.000000 |
mean | 39.690480 | 59067.206200 | 571.405960 | 17.453880 | 24867.10152 | 42.567840 | 455.266240 | 0.286841 | 2.989640 | 1.008720 | ... | 0.715763 | 25975.485086 | 1.054389e+04 | 4.994234 | 0.590080 | 1.711760 | 0.229703 | 1057.883414 | 0.447298 | 0.827551 |
std | 11.589918 | 40428.175718 | 51.433467 | 11.323697 | 13174.53025 | 22.489618 | 241.305423 | 0.160384 | 1.724497 | 1.005089 | ... | 0.159775 | 35511.278579 | 1.996721e+04 | 12.812440 | 3.649737 | 7.492981 | 0.040941 | 776.343951 | 0.385582 | 0.120264 |
min | 18.000000 | 15000.000000 | 359.000000 | 0.000000 | 2841.00000 | 12.000000 | 49.000000 | 0.001998 | 0.000000 | 0.000000 | ... | 0.062400 | 181.106300 | 4.373000e+00 | 0.087400 | 0.000000 | 0.000000 | 0.115500 | 87.850000 | 0.015700 | 0.480000 |
25% | 31.000000 | 31647.500000 | 540.000000 | 9.000000 | 15712.75000 | 24.000000 | 286.000000 | 0.161512 | 2.000000 | 0.000000 | ... | 0.610500 | 7581.818025 | 2.351149e+03 | 1.562975 | 0.000000 | 0.000000 | 0.200700 | 561.795000 | 0.199100 | 0.744000 |
50% | 40.000000 | 48017.500000 | 578.000000 | 17.000000 | 21933.00000 | 36.000000 | 403.000000 | 0.265969 | 3.000000 | 1.000000 | ... | 0.737600 | 15299.118450 | 5.390328e+03 | 2.745500 | 0.000000 | 0.000000 | 0.225600 | 841.795000 | 0.333000 | 0.840000 |
75% | 48.000000 | 73933.000000 | 609.000000 | 25.000000 | 30736.25000 | 48.000000 | 564.000000 | 0.390371 | 4.000000 | 2.000000 | ... | 0.841100 | 30539.138150 | 1.181989e+04 | 5.140450 | 0.000000 | 0.000000 | 0.254800 | 1290.762500 | 0.567225 | 0.910000 |
max | 80.000000 | 366002.000000 | 726.000000 | 62.000000 | 134217.00000 | 120.000000 | 2670.000000 | 0.868611 | 12.000000 | 7.000000 | ... | 0.997000 | 881537.812600 | 1.622216e+06 | 1009.609900 | 102.000000 | 139.000000 | 0.467000 | 12460.580000 | 5.727200 | 1.260000 |
8 rows × 41 columns
data.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 25000 entries, 0 to 24999
Data columns (total 45 columns):
 #   Column                      Non-Null Count  Dtype
---  ------                      --------------  -----
 0   Age                         25000 non-null  int64
 1   AnnualIncome                25000 non-null  int64
 2   CreditScore                 25000 non-null  int64
 3   EmploymentStatus            25000 non-null  object
 4   EducationLevel              25000 non-null  object
 5   Experience                  25000 non-null  int64
 6   LoanAmount                  25000 non-null  int64
 7   LoanDuration                25000 non-null  int64
 8   HomeOwnershipStatus         25000 non-null  object
 9   MonthlyDebtPayments         25000 non-null  int64
 10  CreditCardUtilizationRate   25000 non-null  float64
 11  NumberOfOpenCreditLines     25000 non-null  int64
 12  NumberOfCreditInquiries     25000 non-null  int64
 13  BankruptcyHistory           25000 non-null  int64
 14  LoanPurpose                 25000 non-null  object
 15  PreviousLoanDefaults        25000 non-null  int64
 16  PaymentHistory              25000 non-null  int64
 17  LengthOfCreditHistory       25000 non-null  int64
 18  SavingsAccountBalance       25000 non-null  int64
 19  CheckingAccountBalance      25000 non-null  int64
 20  TotalAssets                 25000 non-null  int64
 21  TotalLiabilities            25000 non-null  int64
 22  MonthlyIncome               25000 non-null  float64
 23  UtilityBillsPaymentHistory  25000 non-null  float64
 24  JobTenure                   25000 non-null  int64
 25  MultipleIncomeSources       25000 non-null  int64
 26  Delinquency30               25000 non-null  int64
 27  Delinquency60               25000 non-null  int64
 28  Delinquency90               25000 non-null  int64
 29  RecencyLastMissedPayment    25000 non-null  int64
 30  PctRevolvingDebt            25000 non-null  float64
 31  StudentLoanDebt             25000 non-null  int64
 32  PaymentToMinimumRatio       25000 non-null  float64
 33  CreditUtilization           25000 non-null  float64
 34  NetWorth                    25000 non-null  int64
 35  PctInstallmentDebt          25000 non-null  float64
 36  SecuredDebt                 25000 non-null  float64
 37  UnsecuredDebt               25000 non-null  float64
 38  SecuredUnsecuredRatio       25000 non-null  float64
 39  BankruptcyRecencyMonths     25000 non-null  int64
 40  DefaultsRecencyMonths       25000 non-null  int64
 41  InterestRate                25000 non-null  float64
 42  MonthlyLoanPayment          25000 non-null  float64
 43  TotalDebtToIncomeRatio      25000 non-null  float64
 44  RiskScore                   25000 non-null  float64
dtypes: float64(14), int64(27), object(4)
memory usage: 8.6+ MB
### Convert binary and other discrete-valued columns to categorical
data['LoanDuration'] = data['LoanDuration'].astype('category')
data['BankruptcyHistory'] = data['BankruptcyHistory'].astype('category')
data['PreviousLoanDefaults'] = data['PreviousLoanDefaults'].astype('category')
data['MultipleIncomeSources'] = data['MultipleIncomeSources'].astype('category')
data['StudentLoanDebt'] = data['StudentLoanDebt'].astype('category')
IV. Explore Data and Relationships¶
With the data inspected and cleaned, we can now explore it. We will start by plotting the distributions of the features as well as their relationships with the dependent variable, RiskScore, which measures credit default risk. We will also examine the correlation among features and use this information to form some initial impressions about what drives credit default risk.
numerical_features = []
categorical_features = []
for col in data.columns:
if (data[col].dtype == 'int64') or (data[col].dtype == 'float64'):
numerical_features.append(col)
else:
categorical_features.append(col)
plot_feature_distributions(data, 'RiskScore', numerical_features)
Several variables exhibit heavily right-skewed, roughly exponential distributions and may benefit from a log transformation, both to better reveal potential relationships with default risk and to rein in extreme outlier values.
log_features = ['AnnualIncome', 'SavingsAccountBalance', 'CheckingAccountBalance', 'TotalAssets', 'TotalLiabilities',
'MonthlyIncome', 'NetWorth', 'MonthlyLoanPayment', 'SecuredDebt', 'UnsecuredDebt']
new_log_features = []
for feature in log_features:
data[f'{feature}_log'] = np.log(data[feature])
new_log_features.append(f'{feature}_log')
new_log_features
['AnnualIncome_log', 'SavingsAccountBalance_log', 'CheckingAccountBalance_log', 'TotalAssets_log', 'TotalLiabilities_log', 'MonthlyIncome_log', 'NetWorth_log', 'MonthlyLoanPayment_log', 'SecuredDebt_log', 'UnsecuredDebt_log']
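One caveat: np.log assumes strictly positive values; if any of these balance columns contained zeros (or a negative net worth), the transform would silently produce -inf or NaN. A defensive variant using np.log1p on a clipped copy is sketched below purely for reference; it is not used downstream.
# Defensive alternative (sketch, not used downstream): log1p maps 0 -> 0 and avoids -inf
safe_logs = np.log1p(data[log_features].clip(lower=0))
safe_logs.describe().loc[['min', 'max']]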
# Log Features after transformation
plot_feature_distributions(data, 'RiskScore', new_log_features)
plot_cat_features(data, 'RiskScore', categorical_features)
After examining the plots, potential relationships begin to emerge between the various credit characteristics and credit default risk, as measured by RiskScore. Notable observations include the following (a quick group-by check after this list puts rough numbers on several of them):
- Risk decreases as education level increases
- Risk decreases as monthly and annual income increase
- Risk decreases as total assets and net worth increase
- Having multiple sources of income reduces credit default risk
- Homeownership status and loan purpose appear to have little impact
- Not surprisingly, credit default risk increases for unemployed applicants and for those with a bankruptcy or previous loan defaults on record
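A quick group-by check (illustrative) puts rough numbers on several of these impressions:
# Mean RiskScore by a few categorical drivers, as a sanity check on the plots above
for col in ['EducationLevel', 'EmploymentStatus', 'MultipleIncomeSources']:
    print(data.groupby(col, observed=True)['RiskScore'].mean().sort_values(), '\n')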
# Remove non-transformed income and other financials
data.drop(log_features, axis=1, inplace=True)
### Examine Correlation of Features
data.corr(numeric_only=True).style.background_gradient(cmap='plasma')
Age | CreditScore | Experience | LoanAmount | MonthlyDebtPayments | CreditCardUtilizationRate | NumberOfOpenCreditLines | NumberOfCreditInquiries | PaymentHistory | LengthOfCreditHistory | UtilityBillsPaymentHistory | JobTenure | Delinquency30 | Delinquency60 | Delinquency90 | RecencyLastMissedPayment | PctRevolvingDebt | PaymentToMinimumRatio | CreditUtilization | PctInstallmentDebt | SecuredUnsecuredRatio | BankruptcyRecencyMonths | DefaultsRecencyMonths | InterestRate | TotalDebtToIncomeRatio | RiskScore | AnnualIncome_log | SavingsAccountBalance_log | CheckingAccountBalance_log | TotalAssets_log | TotalLiabilities_log | MonthlyIncome_log | NetWorth_log | MonthlyLoanPayment_log | SecuredDebt_log | UnsecuredDebt_log | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
Age | 1.000000 | 0.318582 | 0.982907 | 0.016881 | -0.004676 | 0.002834 | -0.012833 | -0.008717 | 0.011631 | 0.002426 | -0.007503 | 0.002857 | -0.002570 | -0.003908 | 0.002548 | 0.003020 | -0.009505 | -0.010411 | -0.011043 | 0.009505 | 0.007011 | -0.009174 | -0.003960 | -0.199963 | -0.098541 | -0.128763 | 0.148753 | -0.011076 | 0.001108 | -0.010112 | 0.003062 | 0.149212 | -0.006903 | -0.005411 | 0.003870 | 0.000734 |
CreditScore | 0.318582 | 1.000000 | 0.325080 | 0.009672 | -0.004154 | -0.010567 | -0.005394 | 0.012732 | 0.007065 | -0.004425 | -0.007746 | 0.003871 | 0.000100 | 0.000545 | 0.002092 | -0.003632 | -0.004064 | 0.003144 | 0.000129 | 0.004064 | 0.004015 | -0.007447 | 0.001087 | -0.623913 | -0.096970 | -0.183187 | 0.118713 | -0.003073 | 0.004708 | -0.008837 | -0.000826 | 0.119118 | -0.002576 | -0.051282 | -0.001363 | 0.001098 |
Experience | 0.982907 | 0.325080 | 1.000000 | 0.015847 | -0.006473 | 0.002137 | -0.013242 | -0.008692 | 0.011012 | 0.000782 | -0.008315 | 0.003168 | -0.001829 | -0.004611 | 0.001153 | 0.001986 | -0.009087 | -0.008493 | -0.009524 | 0.009087 | 0.008297 | -0.010097 | -0.002789 | -0.203203 | -0.100427 | -0.130458 | 0.150518 | -0.010324 | 0.000358 | -0.010468 | 0.002750 | 0.150927 | -0.007331 | -0.006645 | 0.003498 | 0.000467 |
LoanAmount | 0.016881 | 0.009672 | 0.015847 | 1.000000 | -0.004930 | 0.001536 | 0.005456 | 0.005357 | -0.019825 | -0.003033 | 0.005997 | 0.006721 | -0.002725 | 0.003000 | 0.007395 | 0.004192 | -0.000488 | 0.003176 | -0.009358 | 0.000488 | -0.003309 | 0.006842 | 0.001043 | 0.319335 | 0.439389 | 0.151938 | -0.001080 | 0.007892 | -0.006581 | -0.002661 | 0.005792 | -0.001359 | -0.000784 | 0.784665 | 0.006120 | 0.004942 |
MonthlyDebtPayments | -0.004676 | -0.004154 | -0.006473 | -0.004930 | 1.000000 | -0.005905 | -0.007812 | -0.003328 | -0.001800 | -0.004903 | 0.005300 | 0.010561 | 0.005081 | -0.002481 | 0.003452 | -0.011127 | 0.020811 | -0.005737 | -0.009989 | -0.020811 | 0.007288 | 0.004048 | -0.001359 | 0.001902 | 0.189699 | 0.040668 | -0.005991 | 0.007230 | -0.010036 | -0.010999 | 0.006462 | -0.005838 | -0.009160 | -0.000706 | 0.009708 | -0.003530 |
CreditCardUtilizationRate | 0.002834 | -0.010567 | 0.002137 | 0.001536 | -0.005905 | 1.000000 | -0.009601 | 0.004445 | -0.010581 | 0.002243 | 0.002187 | -0.005563 | 0.001190 | -0.002320 | -0.001045 | 0.004662 | 0.001220 | 0.001513 | -0.009530 | -0.001220 | 0.012447 | 0.011143 | -0.003328 | 0.001408 | -0.004132 | 0.059330 | 0.001261 | -0.001431 | -0.001907 | -0.008416 | 0.000479 | 0.001035 | -0.009047 | 0.002383 | 0.001435 | -0.003366 |
NumberOfOpenCreditLines | -0.012833 | -0.005394 | -0.013242 | 0.005456 | -0.007812 | -0.009601 | 1.000000 | 0.010691 | 0.000480 | 0.012094 | -0.000753 | 0.002758 | 0.003561 | -0.003685 | 0.011836 | -0.003175 | 0.007075 | 0.007890 | 0.002926 | -0.007075 | -0.008353 | -0.001234 | -0.008460 | 0.007601 | 0.001222 | 0.001795 | -0.000193 | -0.003916 | -0.001988 | 0.003586 | -0.001491 | -0.000842 | 0.001076 | 0.004858 | -0.002399 | 0.002836 |
NumberOfCreditInquiries | -0.008717 | 0.012732 | -0.008692 | 0.005357 | -0.003328 | 0.004445 | 0.010691 | 1.000000 | -0.003134 | -0.006592 | -0.002811 | -0.001702 | 0.010695 | 0.004497 | 0.005735 | 0.002535 | -0.003040 | 0.014358 | 0.002148 | 0.003040 | -0.000714 | 0.009775 | -0.008643 | -0.007950 | -0.002473 | 0.005860 | -0.000178 | -0.012646 | 0.007163 | -0.007968 | 0.012511 | -0.000877 | -0.013124 | 0.003380 | 0.012109 | 0.010752 |
PaymentHistory | 0.011631 | 0.007065 | 0.011012 | -0.019825 | -0.001800 | -0.010581 | 0.000480 | -0.003134 | 1.000000 | -0.001172 | 0.001153 | 0.005199 | -0.001433 | -0.005785 | -0.009737 | 0.001616 | 0.008418 | -0.000452 | -0.006968 | -0.008418 | 0.017528 | -0.004261 | 0.003465 | -0.012266 | -0.001667 | -0.024516 | -0.007308 | -0.005017 | -0.000353 | 0.007319 | 0.008041 | -0.007429 | 0.004532 | -0.020573 | 0.010798 | 0.000112 |
LengthOfCreditHistory | 0.002426 | -0.004425 | 0.000782 | -0.003033 | -0.004903 | 0.002243 | 0.012094 | -0.006592 | -0.001172 | 1.000000 | -0.006671 | 0.000724 | -0.000367 | 0.010877 | -0.001448 | 0.006807 | 0.013111 | -0.014260 | 0.004670 | -0.013111 | -0.001994 | 0.013201 | 0.002414 | 0.004474 | -0.008733 | -0.134843 | 0.004582 | -0.003377 | -0.008629 | 0.002258 | 0.002703 | 0.004333 | -0.001949 | -0.003575 | 0.002422 | 0.000968 |
UtilityBillsPaymentHistory | -0.007503 | -0.007746 | -0.008315 | 0.005997 | 0.005300 | 0.002187 | -0.000753 | -0.002811 | 0.001153 | -0.006671 | 1.000000 | 0.004598 | -0.003293 | 0.000011 | -0.000123 | 0.000916 | -0.007378 | 0.003945 | -0.000699 | 0.007378 | -0.002977 | 0.003215 | -0.001279 | 0.007733 | 0.007156 | 0.003072 | -0.007342 | -0.000675 | 0.009873 | -0.000751 | -0.000974 | -0.007157 | 0.002506 | 0.001072 | -0.002912 | 0.002848 |
JobTenure | 0.002857 | 0.003871 | 0.003168 | 0.006721 | 0.010561 | -0.005563 | 0.002758 | -0.001702 | 0.005199 | 0.000724 | 0.004598 | 1.000000 | 0.008616 | 0.003294 | 0.010312 | -0.001263 | 0.001783 | -0.003721 | -0.007755 | -0.001783 | -0.011283 | 0.005508 | 0.002772 | 0.006214 | 0.002509 | 0.014009 | 0.000025 | 0.012228 | -0.008503 | -0.008090 | -0.000766 | -0.000455 | -0.000969 | 0.001255 | 0.000656 | -0.000448 |
Delinquency30 | -0.002570 | 0.000100 | -0.001829 | -0.002725 | 0.005081 | 0.001190 | 0.003561 | 0.010695 | -0.001433 | -0.000367 | -0.003293 | 0.008616 | 1.000000 | 0.004417 | 0.008580 | 0.000038 | 0.000972 | 0.006476 | -0.002809 | -0.000972 | 0.004131 | 0.007954 | -0.006125 | -0.008486 | -0.006124 | 0.103225 | 0.006749 | 0.008785 | -0.003193 | -0.004740 | 0.011199 | 0.005130 | -0.006669 | 0.000080 | 0.012417 | 0.007482 |
Delinquency60 | -0.003908 | 0.000545 | -0.004611 | 0.003000 | -0.002481 | -0.002320 | -0.003685 | 0.004497 | -0.005785 | 0.010877 | 0.000011 | 0.003294 | 0.004417 | 1.000000 | -0.001852 | 0.002976 | 0.005285 | -0.004196 | -0.004748 | -0.005285 | 0.004067 | -0.003403 | -0.003343 | -0.001818 | 0.005747 | 0.141475 | -0.002438 | -0.001316 | -0.005135 | 0.009084 | -0.002869 | -0.003042 | 0.007969 | 0.004249 | -0.000650 | -0.008477 |
Delinquency90 | 0.002548 | 0.002092 | 0.001153 | 0.007395 | 0.003452 | -0.001045 | 0.011836 | 0.005735 | -0.009737 | -0.001448 | -0.000123 | 0.010312 | 0.008580 | -0.001852 | 1.000000 | -0.000172 | 0.002626 | 0.004868 | -0.001166 | -0.002626 | 0.004281 | -0.000465 | -0.007672 | 0.002272 | 0.012808 | 0.123537 | -0.010042 | -0.010737 | 0.002079 | 0.007060 | 0.008847 | -0.010352 | -0.002073 | 0.000915 | 0.007578 | 0.007464 |
RecencyLastMissedPayment | 0.003020 | -0.003632 | 0.001986 | 0.004192 | -0.011127 | 0.004662 | -0.003175 | 0.002535 | 0.001616 | 0.006807 | 0.000916 | -0.001263 | 0.000038 | 0.002976 | -0.000172 | 1.000000 | 0.002320 | 0.002143 | -0.008367 | -0.002320 | 0.009112 | -0.000731 | 0.004433 | -0.003007 | 0.001565 | -0.071884 | -0.002528 | -0.002787 | -0.000273 | 0.007131 | 0.008180 | -0.002399 | 0.002720 | 0.004339 | 0.008959 | 0.004966 |
PctRevolvingDebt | -0.009505 | -0.004064 | -0.009087 | -0.000488 | 0.020811 | 0.001220 | 0.007075 | -0.003040 | 0.008418 | 0.013111 | -0.007378 | 0.001783 | 0.000972 | 0.005285 | 0.002626 | 0.002320 | 1.000000 | -0.002982 | 0.006842 | -1.000000 | 0.006898 | 0.017797 | 0.002383 | 0.001107 | 0.009045 | 0.149982 | -0.003063 | -0.006705 | 0.000834 | 0.009079 | -0.003123 | -0.003248 | 0.008107 | -0.001299 | -0.004310 | -0.001997 |
PaymentToMinimumRatio | -0.010411 | 0.003144 | -0.008493 | 0.003176 | -0.005737 | 0.001513 | 0.007890 | 0.014358 | -0.000452 | -0.014260 | 0.003945 | -0.003721 | 0.006476 | -0.004196 | 0.004868 | 0.002143 | -0.002982 | 1.000000 | 0.004917 | 0.002982 | 0.005392 | 0.004784 | 0.004311 | -0.006158 | 0.001980 | -0.174677 | 0.002791 | -0.003613 | -0.000277 | -0.006793 | -0.003347 | 0.002641 | -0.007372 | 0.007021 | -0.003508 | -0.006621 |
CreditUtilization | -0.011043 | 0.000129 | -0.009524 | -0.009358 | -0.009989 | -0.009530 | 0.002926 | 0.002148 | -0.006968 | 0.004670 | -0.000699 | -0.007755 | -0.002809 | -0.004748 | -0.001166 | -0.008367 | 0.006842 | 0.004917 | 1.000000 | -0.006842 | 0.004783 | 0.007917 | -0.000921 | -0.003097 | -0.009404 | -0.001622 | 0.001526 | 0.007187 | 0.003903 | 0.000749 | 0.002815 | 0.001686 | 0.000413 | -0.007750 | 0.005492 | -0.003525 |
PctInstallmentDebt | 0.009505 | 0.004064 | 0.009087 | 0.000488 | -0.020811 | -0.001220 | -0.007075 | 0.003040 | -0.008418 | -0.013111 | 0.007378 | -0.001783 | -0.000972 | -0.005285 | -0.002626 | -0.002320 | -1.000000 | 0.002982 | -0.006842 | 1.000000 | -0.006898 | -0.017797 | -0.002383 | -0.001107 | -0.009045 | -0.149982 | 0.003063 | 0.006705 | -0.000834 | -0.009079 | 0.003123 | 0.003248 | -0.008107 | 0.001299 | 0.004310 | 0.001997 |
SecuredUnsecuredRatio | 0.007011 | 0.004015 | 0.008297 | -0.003309 | 0.007288 | 0.012447 | -0.008353 | -0.000714 | 0.017528 | -0.001994 | -0.002977 | -0.011283 | 0.004131 | 0.004067 | 0.004281 | 0.009112 | 0.006898 | 0.005392 | 0.004783 | -0.006898 | 1.000000 | 0.006053 | 0.006985 | -0.004130 | -0.004560 | -0.058957 | 0.009410 | 0.000167 | -0.004341 | -0.001128 | 0.004037 | 0.008669 | -0.003940 | -0.006663 | 0.079967 | -0.324136 |
BankruptcyRecencyMonths | -0.009174 | -0.007447 | -0.010097 | 0.006842 | 0.004048 | 0.011143 | -0.001234 | 0.009775 | -0.004261 | 0.013201 | 0.003215 | 0.005508 | 0.007954 | -0.003403 | -0.000465 | -0.000731 | 0.017797 | 0.004784 | 0.007917 | -0.017797 | 0.006053 | 1.000000 | -0.001939 | 0.005257 | -0.004528 | 0.198990 | 0.001356 | 0.009743 | 0.003234 | 0.000024 | -0.003775 | 0.002171 | 0.006660 | 0.003707 | -0.004384 | -0.003844 |
DefaultsRecencyMonths | -0.003960 | 0.001087 | -0.002789 | 0.001043 | -0.001359 | -0.003328 | -0.008460 | -0.008643 | 0.003465 | 0.002414 | -0.001279 | 0.002772 | -0.006125 | -0.003343 | -0.007672 | 0.004433 | 0.002383 | 0.004311 | -0.000921 | -0.002383 | 0.006985 | -0.001939 | 1.000000 | -0.004593 | -0.004309 | 0.155668 | 0.002312 | -0.009815 | -0.006717 | 0.002462 | 0.003194 | 0.003167 | 0.002857 | -0.003049 | 0.002657 | 0.002698 |
InterestRate | -0.199963 | -0.623913 | -0.203203 | 0.319335 | 0.001902 | 0.001408 | 0.007601 | -0.007950 | -0.012266 | 0.004474 | 0.007733 | 0.006214 | -0.008486 | -0.001818 | 0.002272 | -0.003007 | 0.001107 | -0.006158 | -0.003097 | -0.001107 | -0.004130 | 0.005257 | -0.004593 | 1.000000 | 0.116514 | 0.238056 | -0.084165 | 0.004083 | -0.000776 | 0.010905 | 0.012589 | -0.084511 | 0.004610 | 0.120626 | 0.013079 | 0.008822 |
TotalDebtToIncomeRatio | -0.098541 | -0.096970 | -0.100427 | 0.439389 | 0.189699 | -0.004132 | 0.001222 | -0.002473 | -0.001667 | -0.008733 | 0.007156 | 0.002509 | -0.006124 | 0.005747 | 0.012808 | 0.001565 | 0.009045 | 0.001980 | -0.009404 | -0.009045 | -0.004560 | -0.004528 | -0.004309 | 0.116514 | 1.000000 | 0.324635 | -0.656498 | -0.000162 | -0.012028 | -0.007619 | -0.012661 | -0.658751 | -0.001950 | 0.535584 | -0.012861 | -0.006929 |
RiskScore | -0.128763 | -0.183187 | -0.130458 | 0.151938 | 0.040668 | 0.059330 | 0.001795 | 0.005860 | -0.024516 | -0.134843 | 0.003072 | 0.014009 | 0.103225 | 0.141475 | 0.123537 | -0.071884 | 0.149982 | -0.174677 | -0.001622 | -0.149982 | -0.058957 | 0.198990 | 0.155668 | 0.238056 | 0.324635 | 1.000000 | -0.468258 | 0.001250 | 0.000709 | -0.213064 | 0.036722 | -0.469280 | -0.207920 | 0.150482 | -0.011480 | 0.125627 |
AnnualIncome_log | 0.148753 | 0.118713 | 0.150518 | -0.001080 | -0.005991 | 0.001261 | -0.000193 | -0.000178 | -0.007308 | 0.004582 | -0.007342 | 0.000025 | 0.006749 | -0.002438 | -0.010042 | -0.002528 | -0.003063 | 0.002791 | 0.001526 | 0.003063 | 0.009410 | 0.001356 | 0.002312 | -0.084165 | -0.656498 | -0.468258 | 1.000000 | 0.005840 | 0.006506 | 0.004752 | 0.006725 | 0.995862 | 0.005187 | -0.006685 | 0.008287 | 0.000258 |
SavingsAccountBalance_log | -0.011076 | -0.003073 | -0.010324 | 0.007892 | 0.007230 | -0.001431 | -0.003916 | -0.012646 | -0.005017 | -0.003377 | -0.000675 | 0.012228 | 0.008785 | -0.001316 | -0.010737 | -0.002787 | -0.006705 | -0.003613 | 0.007187 | 0.006705 | 0.000167 | 0.009743 | -0.009815 | 0.004083 | -0.000162 | 0.001250 | 0.005840 | 1.000000 | -0.004918 | 0.021917 | -0.002808 | 0.005601 | 0.009639 | 0.010271 | -0.003622 | 0.001659 |
CheckingAccountBalance_log | 0.001108 | 0.004708 | 0.000358 | -0.006581 | -0.010036 | -0.001907 | -0.001988 | 0.007163 | -0.000353 | -0.008629 | 0.009873 | -0.008503 | -0.003193 | -0.005135 | 0.002079 | -0.000273 | 0.000834 | -0.000277 | 0.003903 | -0.000834 | -0.004341 | 0.003234 | -0.006717 | -0.000776 | -0.012028 | 0.000709 | 0.006506 | -0.004918 | 1.000000 | 0.004801 | 0.000992 | 0.006500 | -0.003098 | -0.006954 | -0.000274 | 0.001607 |
TotalAssets_log | -0.010112 | -0.008837 | -0.010468 | -0.002661 | -0.010999 | -0.008416 | 0.003586 | -0.007968 | 0.007319 | 0.002258 | -0.000751 | -0.008090 | -0.004740 | 0.009084 | 0.007060 | 0.007131 | 0.009079 | -0.006793 | 0.000749 | -0.009079 | -0.001128 | 0.000024 | 0.002462 | 0.010905 | -0.007619 | -0.213064 | 0.004752 | 0.021917 | 0.004801 | 1.000000 | 0.009329 | 0.004827 | 0.837724 | -0.008521 | 0.009791 | 0.008471 |
TotalLiabilities_log | 0.003062 | -0.000826 | 0.002750 | 0.005792 | 0.006462 | 0.000479 | -0.001491 | 0.012511 | 0.008041 | 0.002703 | -0.000974 | -0.000766 | 0.011199 | -0.002869 | 0.008847 | 0.008180 | -0.003123 | -0.003347 | 0.002815 | 0.003123 | 0.004037 | -0.003775 | 0.003194 | 0.012589 | -0.012661 | 0.036722 | 0.006725 | -0.002808 | 0.000992 | 0.009329 | 1.000000 | 0.007847 | -0.343585 | -0.005729 | 0.967826 | 0.818912 |
MonthlyIncome_log | 0.149212 | 0.119118 | 0.150927 | -0.001359 | -0.005838 | 0.001035 | -0.000842 | -0.000877 | -0.007429 | 0.004333 | -0.007157 | -0.000455 | 0.005130 | -0.003042 | -0.010352 | -0.002399 | -0.003248 | 0.002641 | 0.001686 | 0.003248 | 0.008669 | 0.002171 | 0.003167 | -0.084511 | -0.658751 | -0.469280 | 0.995862 | 0.005601 | 0.006500 | 0.004827 | 0.007847 | 1.000000 | 0.004866 | -0.006659 | 0.009357 | 0.001415 |
NetWorth_log | -0.006903 | -0.002576 | -0.007331 | -0.000784 | -0.009160 | -0.009047 | 0.001076 | -0.013124 | 0.004532 | -0.001949 | 0.002506 | -0.000969 | -0.006669 | 0.007969 | -0.002073 | 0.002720 | 0.008107 | -0.007372 | 0.000413 | -0.008107 | -0.003940 | 0.006660 | 0.002857 | 0.004610 | -0.001950 | -0.207920 | 0.005187 | 0.009639 | -0.003098 | 0.837724 | -0.343585 | 0.004866 | 1.000000 | -0.003385 | -0.332303 | -0.279592 |
MonthlyLoanPayment_log | -0.005411 | -0.051282 | -0.006645 | 0.784665 | -0.000706 | 0.002383 | 0.004858 | 0.003380 | -0.020573 | -0.003575 | 0.001072 | 0.001255 | 0.000080 | 0.004249 | 0.000915 | 0.004339 | -0.001299 | 0.007021 | -0.007750 | 0.001299 | -0.006663 | 0.003707 | -0.003049 | 0.120626 | 0.535584 | 0.150482 | -0.006685 | 0.010271 | -0.006954 | -0.008521 | -0.005729 | -0.006659 | -0.003385 | 1.000000 | -0.006965 | -0.001426 |
SecuredDebt_log | 0.003870 | -0.001363 | 0.003498 | 0.006120 | 0.009708 | 0.001435 | -0.002399 | 0.012109 | 0.010798 | 0.002422 | -0.002912 | 0.000656 | 0.012417 | -0.000650 | 0.007578 | 0.008959 | -0.004310 | -0.003508 | 0.005492 | 0.004310 | 0.079967 | -0.004384 | 0.002657 | 0.013079 | -0.012861 | -0.011480 | 0.008287 | -0.003622 | -0.000274 | 0.009791 | 0.967826 | 0.009357 | -0.332303 | -0.006965 | 1.000000 | 0.671173 |
UnsecuredDebt_log | 0.000734 | 0.001098 | 0.000467 | 0.004942 | -0.003530 | -0.003366 | 0.002836 | 0.010752 | 0.000112 | 0.000968 | 0.002848 | -0.000448 | 0.007482 | -0.008477 | 0.007464 | 0.004966 | -0.001997 | -0.006621 | -0.003525 | 0.001997 | -0.324136 | -0.003844 | 0.002698 | 0.008822 | -0.006929 | 0.125627 | 0.000258 | 0.001659 | 0.001607 | 0.008471 | 0.818912 | 0.001415 | -0.279592 | -0.001426 | 0.671173 | 1.000000 |
In reviewing the correlation matrix, we see evidence of high collinearity between Annual Income and Monthly Income, as well as between Age and Work Experience. To reduce the risk of unreliable estimates, one variable from each pair should be removed prior to modeling. As for the relationships between credit characteristics and the credit default risk score, no feature has a strong positive or negative correlation (>= 0.70) with RiskScore, consistent with the plots above. While there are no strong linear correlations, there may be non-linear relationships or interactions between independent features that affect the risk score. For modeling purposes we will keep these features, except where they are removed to avoid the multicollinearity issues mentioned above.
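As a programmatic check on the pairs called out above, a short sketch can list every feature pair whose absolute correlation exceeds a threshold (0.70 here, matching the cutoff used in the text):
# List feature pairs with |correlation| >= 0.70 (upper triangle only, to avoid duplicates)
corr_abs = data.corr(numeric_only=True).abs()
upper = corr_abs.where(np.triu(np.ones(corr_abs.shape, dtype=bool), k=1))
high_corr_pairs = upper.stack().sort_values(ascending=False)
print(high_corr_pairs[high_corr_pairs >= 0.70])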
### Drop Annual Income and retain Monthly Income due to high collinearity
### Drop Age in favor of Work Experience due to high collinearity
### Drop Total Liabilities, as it equals Secured Debt + Unsecured Debt
### Drop the savings and checking balance logs, which show essentially no correlation with RiskScore
data.drop(['Age', 'AnnualIncome_log', 'SavingsAccountBalance_log', 'CheckingAccountBalance_log', 'TotalLiabilities_log'], axis=1, inplace=True)
V. Modeling¶
We will begin by prepping the training and testing data, fitting several linear and non-linear baseline models, and comparing the performance of each. Specifically, we will look at linear regression, random forest, support vector, stochastic gradient descent, multi-layer perceptron, and extreme gradient boosting regression models.
X = data.drop(['RiskScore'], axis=1)
y = data['RiskScore']
X.head()
CreditScore | EmploymentStatus | EducationLevel | Experience | LoanAmount | LoanDuration | HomeOwnershipStatus | MonthlyDebtPayments | CreditCardUtilizationRate | NumberOfOpenCreditLines | ... | BankruptcyRecencyMonths | DefaultsRecencyMonths | InterestRate | TotalDebtToIncomeRatio | TotalAssets_log | MonthlyIncome_log | NetWorth_log | MonthlyLoanPayment_log | SecuredDebt_log | UnsecuredDebt_log | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 476 | Employed | High School | 22 | 17948 | 24 | Rent | 314 | 0.122216 | 4 | ... | 0 | 0 | 0.2598 | 0.6202 | 9.974179 | 7.632842 | 9.429476 | 6.874023 | 8.670646 | 8.066438 |
1 | 566 | Employed | Associate | 15 | 9532 | 36 | Mortgage | 394 | 0.585098 | 2 | ... | 0 | 0 | 0.2309 | 0.2489 | 10.110461 | 8.028426 | 9.399224 | 5.912070 | 8.972252 | 8.442183 |
2 | 618 | Employed | Master | 25 | 11082 | 60 | Other | 492 | 0.129831 | 0 | ... | 0 | 0 | 0.2046 | 0.0930 | 11.109219 | 9.044610 | 10.772162 | 5.692013 | 9.608886 | 8.345797 |
3 | 688 | Employed | Master | 36 | 12810 | 12 | Mortgage | 342 | 0.284061 | 3 | ... | 0 | 0 | 0.1416 | 0.2029 | 10.051563 | 8.903633 | 9.467383 | 7.048534 | 8.929521 | 7.904466 |
4 | 601 | Employed | High School | 14 | 27331 | 36 | Other | 277 | 0.337430 | 3 | ... | 0 | 30 | 0.2249 | 0.8562 | 12.046772 | 7.346384 | 11.952270 | 6.957259 | 9.212185 | 8.586820 |
5 rows × 39 columns
# Split the data into training and testing sets
from sklearn.model_selection import train_test_split
# Parameters:
# - test_size=.25: Reserve 25% of data for testing
# - random_state=34: Set seed for reproducibility
# (No stratification is used; the target, RiskScore, is continuous.)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.25, random_state=34)
numerical_features = []
categorical_features = []
datetime_features = []
for col in X.columns:
if (X[col].dtype == 'int64') or (X[col].dtype == 'float64'):
numerical_features.append(col)
elif (X[col].dtype == 'object') or (X[col].dtype == 'category'):
categorical_features.append(col)
else:
datetime_features.append(col)
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
# Define transformations
numerical_transformer = Pipeline([
('imputer', SimpleImputer(strategy='mean')),
('scaler', StandardScaler())
])
categorical_transformer = Pipeline([
('encoder', OneHotEncoder(handle_unknown='ignore'))
])
# Combine transformers using ColumnTransformer
preprocessor = ColumnTransformer([
('num', numerical_transformer, numerical_features),
('cat', categorical_transformer, categorical_features)
])
# Fit the preprocessor on the training data and transform it
X_train_transformed = preprocessor.fit_transform(X_train)
# Ensure X_train_transformed is a dense array (important for sparse outputs)
if hasattr(X_train_transformed, "toarray"):
X_train_transformed = X_train_transformed.toarray()
# Get numerical feature names after transformation (they remain the same)
numerical_columns = numerical_features
# Get categorical feature names from the OneHotEncoder
encoded_columns = preprocessor.named_transformers_['cat'].named_steps['encoder'].get_feature_names_out(categorical_features)
# Combine all feature names
all_columns = np.concatenate([numerical_columns, encoded_columns])
# Create DataFrame with correct column names
X_train_transformed = pd.DataFrame(X_train_transformed, columns=all_columns)
# Transform the test data using the preprocessor fitted on the training set
X_test_transformed = preprocessor.transform(X_test)
# Ensure X_test_transformed is a dense array (important for sparse outputs)
if hasattr(X_test_transformed, "toarray"):
X_test_transformed = X_test_transformed.toarray()
# Get numerical feature names after transformation (they remain the same)
numerical_columns = numerical_features
# Get categorical feature names from the OneHotEncoder
encoded_columns = preprocessor.named_transformers_['cat'].named_steps['encoder'].get_feature_names_out(categorical_features)
# Combine all feature names
all_columns = np.concatenate([numerical_columns, encoded_columns])
# Create DataFrame with correct column names
X_test_transformed = pd.DataFrame(X_test_transformed, columns=all_columns)
### Train Base Models
from sklearn.linear_model import LinearRegression, SGDRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, AdaBoostRegressor, GradientBoostingRegressor
from sklearn.svm import SVR
from sklearn.neural_network import MLPRegressor
from lightgbm import LGBMRegressor
from xgboost import XGBRegressor
lm = LinearRegression()
svr = SVR()
sgd = SGDRegressor(random_state=42)
rf = RandomForestRegressor(random_state=12)
mlp = MLPRegressor(max_iter=2000, verbose=False, early_stopping=True)
xgb = XGBRegressor(random_state=42, verbosity=0)
models = [lm, svr, sgd, rf, mlp, xgb]
for model in models:
model.fit(X_train_transformed, y_train)
trained_models = {
"Random Forest": rf,
"SVR": svr,
"Linear Regression": lm,
"SGD": sgd,
"MLP": mlp,
"XGBoost":xgb,
}
evaluate_models(trained_models, X_test_transformed, y_test)
Model | RMSE | MAE | R² | |
---|---|---|---|---|
4 | MLP | 0.0481 | 0.0369 | 0.8415 |
5 | XGBoost | 0.0514 | 0.0393 | 0.8188 |
0 | Random Forest | 0.0613 | 0.0458 | 0.7429 |
1 | SVR | 0.0631 | 0.0508 | 0.7274 |
2 | Linear Regression | 0.0671 | 0.0543 | 0.6918 |
3 | SGD | 0.0674 | 0.0545 | 0.6891 |
Fine Tuning Parameters¶
After training baseline models, the multi-layer perceptron regression model performs best on the test data. However, this model type tends to be more opaque than some others and may not be the most appropriate choice given the regulatory climate of banking and lending. In these industries, insight into the factors a model uses to make predictions is necessary, so we will move forward with the extreme gradient boosting (XGBoost) regression model, which also performed well on the test data while offering insight into feature importance via the Shapley Additive Explanations (SHAP) method. Using the steps below, we will now fine-tune the XGBoost model in an attempt to improve performance:
- Defining a parameter grid with potential values for each hyperparameter
- Using cross-validation to find the optimal combination of parameters
- Evaluating the tuned model on the test set
- Analyzing the final model's performance and feature importance
from sklearn.model_selection import GridSearchCV
params = {
'n_estimators': [100, 150, 200, 250, 300],
'max_depth': [3, 5, 7, 13],
'learning_rate': [.1, .01, .001]
}
xgb = XGBRegressor(random_state=42)
xgb_grid = GridSearchCV(
estimator=xgb,
param_grid=params,
cv=7,
verbose=0
)
xgb_grid.fit(X_train_transformed, y_train)
y_pred = xgb_grid.predict(X_test_transformed)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
print("")
print(f'The best model parameters: {xgb_grid.best_params_}, with an RMSE of {rmse}')
print("")
The best model parameters: {'learning_rate': 0.1, 'max_depth': 5, 'n_estimators': 300}, with an RMSE of 0.0479776212766889
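For completeness, the full cross-validation grid can be inspected via scikit-learn's cv_results_ attribute to see how close the runner-up parameter combinations were (a quick sketch):
# Top 5 parameter combinations by cross-validated rank
cv_results = pd.DataFrame(xgb_grid.cv_results_)
cols = ['param_learning_rate', 'param_max_depth', 'param_n_estimators',
        'mean_test_score', 'std_test_score', 'rank_test_score']
cv_results.sort_values('rank_test_score')[cols].head()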
import shap
import numpy as np
import matplotlib.pyplot as plt
def plot_shap_bar(model, X_train, top_n=10):
"""
Plots SHAP feature importance as a bar chart.
Parameters:
- model: Trained model (supports SHAP values like tree-based models or has a wrapper)
- X_train: Feature matrix used for training
- top_n: Number of top features to display (default is 10)
Returns:
- A horizontal bar chart showing the top feature importances.
"""
# Compute SHAP values
explainer = shap.Explainer(model, X_train)
shap_values = explainer(X_train, check_additivity=False)
# Get mean absolute SHAP values for each feature
shap_importance = np.abs(shap_values.values).mean(axis=0)
# Create DataFrame of feature importance
feature_importance = pd.DataFrame({'Feature': X_train.columns, 'SHAP Importance': shap_importance})
# Sort by importance
feature_importance = feature_importance.sort_values(by="SHAP Importance", ascending=False).head(top_n)
# Plot as a bar chart
plt.figure(figsize=(10, 6))
plt.barh(feature_importance['Feature'], feature_importance['SHAP Importance'], color='royalblue')
plt.gca().invert_yaxis() # Highest importance at the top
plt.xlabel("Mean |SHAP Value|")
plt.ylabel("Feature")
plt.title("SHAP Feature Importance (Bar Plot)")
plt.show()
# Refit using the best parameters found by the grid search
xgb_reg = XGBRegressor(learning_rate=0.1, max_depth=5, n_estimators=300)
xgb_reg.fit(X_train_transformed, y_train)
XGBRegressor(base_score=None, booster=None, callbacks=None, colsample_bylevel=None, colsample_bynode=None, colsample_bytree=None, device=None, early_stopping_rounds=None, enable_categorical=False, eval_metric=None, feature_types=None, feature_weights=None, gamma=None, grow_policy=None, importance_type=None, interaction_constraints=None, learning_rate=0.1, max_bin=None, max_cat_threshold=None, max_cat_to_onehot=None, max_delta_step=None, max_depth=5, max_leaves=None, min_child_weight=None, missing=nan, monotone_constraints=None, multi_strategy=None, n_estimators=300, n_jobs=None, num_parallel_tree=None, ...)
One of the reasons for selecting the extreme gradient boosting model over the multi-layer perceptron model was the greater insight into feature importance offered by SHAP. Below we can see the top 10 most important features used by the model in predicting a Credit Risk Score. In reviewing them, we can broadly classify the features into two groups: ability-to-repay (Monthly Income, Net Worth, Multiple Income Sources) and debt load/management (Debt-to-Income Ratio, Recency of Last Missed Payment, Secured/Unsecured Debt Ratio, Payment-To-Minimum Ratio), which intuitively make sense as drivers for credit risk.
## SHAP feature importance XGBoost
plot_shap_bar(xgb_reg, X_train_transformed)
VI. Conclusion¶
This analysis has provided valuable insights into the factors that drive credit risk in loan applications. While the multi-layer perceptron model demonstrated slightly better performance on test data, the extreme gradient boosting (XGBoost) model was selected for its superior interpretability through SHAP values, which is crucial in the regulatory environment of banking and lending.
The feature importance analysis revealed that credit risk is primarily driven by two categories of factors: ability-to-repay metrics (including Monthly Income, Net Worth, and Multiple Income Sources) and debt load/management indicators (Debt-to-Income Ratio, Recency of Last Missed Payment, Secured/Unsecured Debt Ratio, and Payment-To-Minimum Ratio). These findings align with established credit risk theory and provide a robust foundation for making data-driven lending decisions.
By implementing this model in loan approval processes, financial institutions can better assess borrower creditworthiness, price loans appropriately based on risk, and develop targeted risk mitigation strategies for high-risk applicants, ultimately leading to more informed lending decisions and improved portfolio performance.