import os
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'
import pandas as pd
import numpy as np
import seaborn as sns
import category_encoders as ce
from scipy import stats 
import matplotlib.pyplot as plt
import hvplot.pandas
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.preprocessing import MinMaxScaler

from sklearn.metrics import (
    accuracy_score, confusion_matrix, classification_report, 
    roc_auc_score, roc_curve, auc,
    ConfusionMatrixDisplay, RocCurveDisplay
)
from sklearn.metrics import ConfusionMatrixDisplay, RocCurveDisplay

from xgboost import XGBClassifier
from sklearn.ensemble import RandomForestClassifier

import tensorflow as tf
from tensorflow.keras.models import Sequential, Model
from tensorflow.keras.layers import Dense, Dropout, BatchNormalization 
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.metrics import AUC

pd.set_option('display.float', '{:.2f}'.format)
pd.set_option('display.max_columns', 50)
pd.set_option('display.max_rows', 50)

import warnings
warnings.filterwarnings("ignore")

file_location = "/home/hduser/backup/data/accepted_2007_to_2018Q4.csv.gz"

selected_columns = [
    "id",
    "purpose",
    "term",
    "verification_status",
    "acc_now_delinq",
    "addr_state",
    "annual_inc",
    "application_type",
    "dti",
    "grade",
    "home_ownership",
    "initial_list_status",
    "installment",
    "int_rate",
    "loan_amnt",
    "loan_status",
    "tax_liens",
    "delinq_amnt",
    "pub_rec",
    "last_fico_range_high",
    "last_fico_range_low",
    "recoveries",
    "collection_recovery_fee"
]

# Load only the selected columns (Pandas can read gzip directly)
df = pd.read_csv(file_location, usecols=selected_columns, compression='gzip', low_memory=False)

# Show the first few rows
df.head()

print("Shape of the data frame :",df.shape)

Shape of the data frame : (2260701, 23)

df = df.dropna()

df.dtypes

id                          object
loan_amnt                  float64
term                        object
int_rate                   float64
installment                float64
grade                       object
home_ownership              object
annual_inc                 float64
verification_status         object
loan_status                 object
purpose                     object
addr_state                  object
dti                        float64
pub_rec                    float64
initial_list_status         object
recoveries                 float64
collection_recovery_fee    float64
last_fico_range_high       float64
last_fico_range_low        float64
application_type            object
acc_now_delinq             float64
delinq_amnt                float64
tax_liens                  float64
dtype: object

df.describe()

df["loan_status"].unique()

array(['Fully Paid', 'Current', 'Charged Off', 'In Grace Period',
       'Late (31-120 days)', 'Late (16-30 days)', 'Default',
       'Does not meet the credit policy. Status:Fully Paid',
       'Does not meet the credit policy. Status:Charged Off'],
      dtype=object)

df = df[df.loan_status != "Current"]

df["loan_status"].unique()

array(['Fully Paid', 'Charged Off', 'In Grace Period',
       'Late (31-120 days)', 'Late (16-30 days)', 'Default',
       'Does not meet the credit policy. Status:Fully Paid',
       'Does not meet the credit policy. Status:Charged Off'],
      dtype=object)

df.drop('id', axis=1, inplace=True)

# all columns
data_encoded = df.copy()
for col in data_encoded.select_dtypes(include=['object']).columns:
    data_encoded[col] = data_encoded[col].astype('category').cat.codes

plt.figure(figsize=(21, 14))
sns.heatmap(data_encoded.corr(), annot=True, cmap='viridis', fmt=".2f", linewidths=0.5)
plt.title("Correlation Heatmap (Numeric + Encoded Categorical Features)", fontsize=18, pad=12)
plt.tight_layout()
plt.show()

Good_Loan_statuses = [
    'Fully Paid',
    'In Grace Period',
    'Does not meet the credit policy. Status:Fully Paid'
]
Bad_Loan_statuses = [
    'Charged Off',
    'Does not meet the credit policy. Status:Charged Off',
    'Late (16-30 days)',
    'Late (31-120 days)',
    'Default'
]

# Filter the DataFrame for the relevant statuses
df_filtered = df[df['loan_status'].isin(Good_Loan_statuses + Bad_Loan_statuses)].copy()

# Create a new column 'target' to group the statuses
df_filtered.loc[:, 'target_status'] = df_filtered['loan_status'].apply(
    lambda x: 'Good Loan' if x in Good_Loan_statuses else 'Bad Loan'
)

# Plot
fig, ax = plt.subplots(figsize=(10, 6))
sns.despine()
sns.countplot(data=df_filtered, x='target_status', palette=['lightgreen', 'salmon'])

# Add legend
handles = ax.patches
labels = ['Good Loan', 'Bad Loan']
ax.legend(handles=handles, labels=labels, loc='upper right')

ax.set(xlabel='Loan Status', ylabel='Count')
ax.set_title('Loan Status Count', size=20)
plt.tight_layout()
plt.show()

filtered_df = df.sample(n=100000, random_state=42)

# Define the mapping for "Good Loan" and "Bad Loan"
Good_Loan_statuses = [
    'Fully Paid',
    'In Grace Period',
    'Does not meet the credit policy. Status:Fully Paid'
]

# Update the loan_status column
filtered_df['loan_status'] = filtered_df['loan_status'].apply(
    lambda x: 'Good Loan' if x in Good_Loan_statuses else 'Bad Loan'
)

# Verify the updated values
print(filtered_df["loan_status"].unique())

['Good Loan' 'Bad Loan']

filtered_df.head()

# Visualization for total loan count by state (In the USA)

fig, ax =plt.subplots(figsize=(20,10))
sns.despine()
order = filtered_df["addr_state"].value_counts().index
sns.countplot(data=filtered_df,x="addr_state",order=order)
ax.tick_params(axis='x', labelrotation=90)
ax.set(xlabel='State', ylabel='')
ax.set_title('Loan count by state', size=20)

Text(0.5, 1.0, 'Loan count by state')

# Grade count by loan status

# Ensure 'grade' is treated as a string and drop NaN values if any
filtered_df['grade'] = filtered_df['grade'].astype(str)
order = sorted(filtered_df["grade"].unique())

# Plot
fig, ax = plt.subplots(figsize=(12, 8))
sns.despine()
sns.countplot(data=filtered_df, x="grade", hue="loan_status", order=order)
ax.tick_params(axis='x', labelrotation=0)
ax.set(xlabel='Grade', ylabel='Count')
ax.set_title('Grade assigned by LC', size=20)
plt.tight_layout()
plt.show()

# Term count by loan status

fig, ax =plt.subplots(figsize=(12,8)) 
sns.despine() 
order=sorted(filtered_df["term"].unique())
sns.countplot(data=filtered_df,x="term",hue="loan_status",order=order)
ax.tick_params(axis='x', labelrotation=0)
ax.set(xlabel='Months', ylabel='')
ax.set_title('Term of the loan', size=20)

Text(0.5, 1.0, 'Term of the loan')

# Purpose of loan count by loan status

fig, ax =plt.subplots(1,2,figsize=(20,8))

sns.despine() 

ax[0].tick_params(axis='x', labelrotation=90)
ax[0].set(xlabel='Purpose', ylabel='')
ax[0].set_title('Purpose of loan - Full', size=20)
ax[1].tick_params(axis='x', labelrotation=90)
ax[1].set(xlabel='Purpose', ylabel='')
ax[1].set_title('Purpose of loan - Last values zoom-in', size=20)

sns.countplot(data=filtered_df,x="purpose",hue="loan_status",
              order=filtered_df["purpose"].value_counts().index,ax=ax[0])

sns.countplot(data=filtered_df,x="purpose",hue="loan_status",
              order=["house","wedding","renewable_energy",
                    "educational"],ax=ax[1])

<Axes: title={'center': 'Purpose of loan - Last values zoom-in'}, xlabel='Purpose', ylabel='count'>

# Home ownership status count by loan status

fig, ax =plt.subplots(1,2,figsize=(20,8))

sns.despine() 

ax[0].tick_params(axis='x', labelrotation=0)
ax[0].set(xlabel='Ownership status', ylabel='')
ax[0].set_title('Ownership - Full', size=20)
ax[1].tick_params(axis='x', labelrotation=0)
ax[1].set(xlabel='Ownership status', ylabel='')
ax[1].set_title('Ownership - Last values zoom-in', size=20)

sns.countplot(data=filtered_df,x="home_ownership",hue="loan_status",ax=ax[0])
sns.countplot(data=filtered_df,x="home_ownership",hue="loan_status",order=["ANY","NONE","OTHER"],ax=ax[1])

<Axes: title={'center': 'Ownership - Last values zoom-in'}, xlabel='Ownership status', ylabel='count'>

# Installment amount count by loan status

fig, ax =plt.subplots(1,2,figsize=(20,8))

sns.despine() 

ax[0].tick_params(axis='x', labelrotation=0)
ax[0].set(xlabel='Installments amount in USD', ylabel='')
ax[0].set_title('Installment amount by loan type - Distribution', size=20)
ax[1].tick_params(axis='x', labelrotation=0)
ax[1].set_title('Installment amount by loan type - Boxplot', size=20)


sns.histplot(data=filtered_df,x="installment",hue="loan_status",bins=30,
            kde=True,ax=ax[0])
sns.boxplot(data=filtered_df,x="loan_status",y="installment",ax=ax[1]).set(xlabel='Loan Status|', 
                                                                       ylabel='Amount in USD')

[Text(0.5, 0, 'Loan Status|'), Text(0, 0.5, 'Amount in USD')]

# Interest rate count by loan status

fig, ax =plt.subplots(1,2,figsize=(20,8))

sns.despine() 

ax[0].tick_params(axis='x', labelrotation=0)
ax[0].set(xlabel='Interest rate in %', ylabel='')
ax[0].set_title('Interest rate by loan type - Distribution', size=20)
ax[1].tick_params(axis='x', labelrotation=0)
ax[1].set_title('Interest rate by loan type - Boxplot', size=20)


sns.histplot(data=filtered_df,x="int_rate",hue="loan_status",bins=30,
            kde=True,ax=ax[0])

sns.boxplot(data=filtered_df,x="loan_status",y="int_rate",ax=ax[1]).set(xlabel='Loan Status', 
                                                                    ylabel='Interest rate in %')

[Text(0.5, 0, 'Loan Status'), Text(0, 0.5, 'Interest rate in %')]

loan_amnt_box = filtered_df.hvplot.box(
    y='loan_amnt', subplots=True, by='loan_status', width=300, height=350, 
    title="Loan Status by Loan Amount ", xlabel='Loan Status', ylabel='Loan Amount'
)

installment_box = filtered_df.hvplot.box(
    y='installment', subplots=True, by='loan_status', width=300, height=350, 
    title="Loan Status by Installment", xlabel='Loan Status', ylabel='Installment'
)

loan_amnt_box + installment_box

def pub_rec(number):
    if number == 0.0:
        return 0
    else:
        return 1
    
def delinq_amnt(number):
    if number == 0.0:
        return 0
    elif number >= 1.0:
        return 1
    else:
        return number
    
def acc_now_delinq(number):
    if number == 0.0:
        return 0
    elif number >= 1.0:
        return 1
    else:
        return number

filtered_df['pub_rec'] = filtered_df.pub_rec.apply(pub_rec)
filtered_df['delinq_amnt'] = filtered_df.delinq_amnt.apply(delinq_amnt)
filtered_df['acc_now_delinq'] = filtered_df.acc_now_delinq.apply(acc_now_delinq)

plt.figure(figsize=(12, 30))

plt.subplot(6, 2, 1)
sns.countplot(x='pub_rec', data=filtered_df, hue='loan_status')

plt.subplot(6, 2, 2)
sns.countplot(x='initial_list_status', data=filtered_df, hue='loan_status')

plt.subplot(6, 2, 3)
sns.countplot(x='application_type', data=filtered_df, hue='loan_status')

plt.subplot(6, 2, 4)
sns.countplot(x='delinq_amnt', data=filtered_df, hue='loan_status')

plt.subplot(6, 2, 5)
sns.countplot(x='acc_now_delinq', data=filtered_df, hue='loan_status')

<Axes: xlabel='acc_now_delinq', ylabel='count'>

encoder = ce.OrdinalEncoder()
encoded_data = encoder.fit_transform(filtered_df)

# Compute correlation on all columns
encoded_data.corr()['loan_status'] \
    .drop('loan_status') \
    .sort_values() \
    .hvplot.barh(
        width=600, height=400,
        title="Correlation between Loan Status and All Features",
        ylabel='Correlation', xlabel='All Encoded Features'
    )

# Define the mapping for "Good Loan" and "Bad Loan"
Good_Loan_statuses = [
    'Fully Paid',
    'In Grace Period',
    'Does not meet the credit policy. Status:Fully Paid'
]

# Update the loan_status column
df['loan_status'] = df['loan_status'].apply(
    lambda x: 'Good Loan' if x in Good_Loan_statuses else 'Bad Loan'
)

# Verify the updated values
print(df["loan_status"].unique())

['Good Loan' 'Bad Loan']

df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1381834 entries, 0 to 2260697
Data columns (total 22 columns):
 #   Column                   Non-Null Count    Dtype  
---  ------                   --------------    -----  
 0   loan_amnt                1381834 non-null  float64
 1   term                     1381834 non-null  object 
 2   int_rate                 1381834 non-null  float64
 3   installment              1381834 non-null  float64
 4   grade                    1381834 non-null  object 
 5   home_ownership           1381834 non-null  object 
 6   annual_inc               1381834 non-null  float64
 7   verification_status      1381834 non-null  object 
 8   loan_status              1381834 non-null  object 
 9   purpose                  1381834 non-null  object 
 10  addr_state               1381834 non-null  object 
 11  dti                      1381834 non-null  float64
 12  pub_rec                  1381834 non-null  float64
 13  initial_list_status      1381834 non-null  object 
 14  recoveries               1381834 non-null  float64
 15  collection_recovery_fee  1381834 non-null  float64
 16  last_fico_range_high     1381834 non-null  float64
 17  last_fico_range_low      1381834 non-null  float64
 18  application_type         1381834 non-null  object 
 19  acc_now_delinq           1381834 non-null  float64
 20  delinq_amnt              1381834 non-null  float64
 21  tax_liens                1381834 non-null  float64
dtypes: float64(13), object(9)
memory usage: 242.5+ MB

print([column for column in df.columns if df[column].dtype == object])

['term', 'grade', 'home_ownership', 'verification_status', 'loan_status', 'purpose', 'addr_state', 'initial_list_status', 'application_type']

print([column for column in df.columns if pd.api.types.is_numeric_dtype(df[column])])

['loan_amnt', 'int_rate', 'installment', 'annual_inc', 'dti', 'pub_rec', 'recoveries', 'collection_recovery_fee', 'last_fico_range_high', 'last_fico_range_low', 'acc_now_delinq', 'delinq_amnt', 'tax_liens']

# Encoding target values to dummy values 

df['loan_status'] = df['loan_status'].map({'Good Loan':0,'Bad Loan':1})

df.term.unique()

array([' 36 months', ' 60 months'], dtype=object)

term_values = {' 36 months': 36, ' 60 months': 60}
df['term'] = df.term.map(term_values)

df.term.unique()

array([36, 60])

dummies = ['grade', 'home_ownership', 'verification_status', 'purpose', 'addr_state', 
           'initial_list_status', 'application_type']
df = pd.get_dummies(df, columns=dummies, drop_first=True)

df.head()

w_p = df.loan_status.value_counts()[0] / df.shape[0]
w_n = df.loan_status.value_counts()[1] / df.shape[0]

print(f"Weight of positive values {w_p}")
print(f"Weight of negative values {w_n}")

Weight of positive values 0.7864685627940838
Weight of negative values 0.2135314372059162

train, test = train_test_split(df, test_size=0.33, random_state=42)

print(train.shape)
print(test.shape)

(925828, 93)
(456006, 93)

print(train.shape)
train = train[train['annual_inc'] <= 250000]
train = train[train['dti'] <= 50]
print(train.shape)

(925828, 93)
(914316, 93)

X_train, y_train = train.drop('loan_status', axis=1), train.loan_status
X_test, y_test = test.drop('loan_status', axis=1), test.loan_status

X_train.head()

y_train.head()

1304582    0
1118221    1
1862395    1
420596     1
2035521    1
Name: loan_status, dtype: int64

X_train.dtypes

loan_amnt                     float64
term                            int64
int_rate                      float64
installment                   float64
annual_inc                    float64
                               ...   
addr_state_WI                    bool
addr_state_WV                    bool
addr_state_WY                    bool
initial_list_status_w            bool
application_type_Joint App       bool
Length: 92, dtype: object

scaler = MinMaxScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

def print_score(true, pred, train=True):
    if train:
        clf_report = pd.DataFrame(classification_report(true, pred, output_dict=True))
        print("Train Result:\n________________________________________________")
        print(f"Accuracy Score: {accuracy_score(true, pred) * 100:.2f}%")
        print("_______________________________________________")
        print(f"CLASSIFICATION REPORT:\n{clf_report}")
        print("_______________________________________________")
        print(f"Confusion Matrix: \n {confusion_matrix(true, pred)}\n")
        
    elif train==False:
        clf_report = pd.DataFrame(classification_report(true, pred, output_dict=True))
        print("Test Result:\n________________________________________________")        
        print(f"Accuracy Score: {accuracy_score(true, pred) * 100:.2f}%")
        print("_______________________________________________")
        print(f"CLASSIFICATION REPORT:\n{clf_report}")
        print("_______________________________________________")
        print(f"Confusion Matrix: \n {confusion_matrix(true, pred)}\n")

X_train = np.array(X_train).astype(np.float32)
X_test = np.array(X_test).astype(np.float32)
y_train = np.array(y_train).astype(np.float32)
y_test = np.array(y_test).astype(np.float32)

# Initialize model
lr_clf = LogisticRegression(max_iter=1000, random_state=42)

# Train
lr_clf.fit(X_train, y_train)

# Predictions
y_train_pred = lr_clf.predict(X_train)
y_test_pred = lr_clf.predict(X_test)

# Evaluate
print_score(y_train, y_train_pred, train=True)
print_score(y_test, y_test_pred, train=False)

Train Result:
________________________________________________
Accuracy Score: 92.74%
_______________________________________________
CLASSIFICATION REPORT:
                0.0       1.0  accuracy  macro avg  weighted avg
precision      0.94      0.87      0.93       0.91          0.93
recall         0.97      0.78      0.93       0.87          0.93
f1-score       0.95      0.82      0.93       0.89          0.93
support   719181.00 195135.00      0.93  914316.00     914316.00
_______________________________________________
Confusion Matrix: 
 [[696420  22761]
 [ 43594 151541]]

Test Result:
________________________________________________
Accuracy Score: 92.68%
_______________________________________________
CLASSIFICATION REPORT:
                0.0      1.0  accuracy  macro avg  weighted avg
precision      0.94     0.87      0.93       0.90          0.93
recall         0.97     0.78      0.93       0.87          0.93
f1-score       0.95     0.82      0.93       0.89          0.93
support   358208.00 97798.00      0.93  456006.00     456006.00
_______________________________________________
Confusion Matrix: 
 [[346817  11391]
 [ 21981  75817]]

disp = ConfusionMatrixDisplay.from_estimator(
    lr_clf, X_test, y_test, 
    cmap='Blues', values_format='d', 
    display_labels=['Default', 'Fully-Paid']
)

disp = RocCurveDisplay.from_estimator(lr_clf, X_test, y_test)

scores_dict = {
    'Logistic Regression': {
        'Train': roc_auc_score(y_train, lr_clf.predict(X_train)),
        'Test': roc_auc_score(y_test, lr_clf.predict(X_test)),
    },
}

# Initialize Decision Tree model
dt_clf = DecisionTreeClassifier(max_depth=10, criterion='entropy', random_state=42)

# Train the model
dt_clf.fit(X_train, y_train)

# Predictions
y_train_pred = dt_clf.predict(X_train)
y_test_pred = dt_clf.predict(X_test)

# Evaluate
print_score(y_train, y_train_pred, train=True)
print_score(y_test, y_test_pred, train=False)

Train Result:
________________________________________________
Accuracy Score: 94.34%
_______________________________________________
CLASSIFICATION REPORT:
                0.0       1.0  accuracy  macro avg  weighted avg
precision      0.95      0.91      0.94       0.93          0.94
recall         0.98      0.82      0.94       0.90          0.94
f1-score       0.96      0.86      0.94       0.91          0.94
support   719181.00 195135.00      0.94  914316.00     914316.00
_______________________________________________
Confusion Matrix: 
 [[703447  15734]
 [ 36016 159119]]

Test Result:
________________________________________________
Accuracy Score: 94.16%
_______________________________________________
CLASSIFICATION REPORT:
                0.0      1.0  accuracy  macro avg  weighted avg
precision      0.95     0.91      0.94       0.93          0.94
recall         0.98     0.81      0.94       0.89          0.94
f1-score       0.96     0.86      0.94       0.91          0.94
support   358208.00 97798.00      0.94  456006.00     456006.00
_______________________________________________
Confusion Matrix: 
 [[350067   8141]
 [ 18508  79290]]

disp = ConfusionMatrixDisplay.from_estimator(
    dt_clf, X_test, y_test, 
    cmap='Blues', values_format='d', 
    display_labels=['Default', 'Fully-Paid']
)

disp = RocCurveDisplay.from_estimator(dt_clf, X_test, y_test)

scores_dict['Decision Tree'] = {
        'Train': roc_auc_score(y_train, dt_clf.predict(X_train)),
        'Test': roc_auc_score(y_test, dt_clf.predict(X_test)),
    }

# Initialize model
gnb_clf = GaussianNB()

# Train
gnb_clf.fit(X_train, y_train)

# Predictions
y_train_pred = gnb_clf.predict(X_train)
y_test_pred = gnb_clf.predict(X_test)

# Evaluate
print_score(y_train, y_train_pred, train=True)
print_score(y_test, y_test_pred, train=False)

Train Result:
________________________________________________
Accuracy Score: 90.66%
_______________________________________________
CLASSIFICATION REPORT:
                0.0       1.0  accuracy  macro avg  weighted avg
precision      0.92      0.84      0.91       0.88          0.90
recall         0.96      0.69      0.91       0.83          0.91
f1-score       0.94      0.76      0.91       0.85          0.90
support   719181.00 195135.00      0.91  914316.00     914316.00
_______________________________________________
Confusion Matrix: 
 [[693934  25247]
 [ 60159 134976]]

Test Result:
________________________________________________
Accuracy Score: 90.57%
_______________________________________________
CLASSIFICATION REPORT:
                0.0      1.0  accuracy  macro avg  weighted avg
precision      0.92     0.84      0.91       0.88          0.90
recall         0.96     0.69      0.91       0.83          0.91
f1-score       0.94     0.76      0.91       0.85          0.90
support   358208.00 97798.00      0.91  456006.00     456006.00
_______________________________________________
Confusion Matrix: 
 [[345411  12797]
 [ 30219  67579]]

disp = ConfusionMatrixDisplay.from_estimator(
    gnb_clf, X_test, y_test, 
    cmap='Blues', values_format='d', 
    display_labels=['Default', 'Fully-Paid']
)

disp = RocCurveDisplay.from_estimator(gnb_clf, X_test, y_test)

scores_dict['GNB'] = {
        'Train': roc_auc_score(y_train, gnb_clf.predict(X_train)),
        'Test': roc_auc_score(y_test, gnb_clf.predict(X_test)),
    }

# Initialize model
gb_clf = GradientBoostingClassifier(n_estimators=100, learning_rate=0.1, random_state=42)

# Train
gb_clf.fit(X_train, y_train)

# Predictions
y_train_pred = gb_clf.predict(X_train)
y_test_pred = gb_clf.predict(X_test)

# Evaluate
print_score(y_train, y_train_pred, train=True)
print_score(y_test, y_test_pred, train=False)

Train Result:
________________________________________________
Accuracy Score: 94.35%
_______________________________________________
CLASSIFICATION REPORT:
                0.0       1.0  accuracy  macro avg  weighted avg
precision      0.95      0.92      0.94       0.93          0.94
recall         0.98      0.81      0.94       0.89          0.94
f1-score       0.96      0.86      0.94       0.91          0.94
support   719181.00 195135.00      0.94  914316.00     914316.00
_______________________________________________
Confusion Matrix: 
 [[704611  14570]
 [ 37111 158024]]

Test Result:
________________________________________________
Accuracy Score: 94.32%
_______________________________________________
CLASSIFICATION REPORT:
                0.0      1.0  accuracy  macro avg  weighted avg
precision      0.95     0.92      0.94       0.93          0.94
recall         0.98     0.81      0.94       0.89          0.94
f1-score       0.96     0.86      0.94       0.91          0.94
support   358208.00 97798.00      0.94  456006.00     456006.00
_______________________________________________
Confusion Matrix: 
 [[351035   7173]
 [ 18740  79058]]

disp = ConfusionMatrixDisplay.from_estimator(
    gb_clf, X_test, y_test, 
    cmap='Blues', values_format='d', 
    display_labels=['Default', 'Fully-Paid']
)

disp = RocCurveDisplay.from_estimator(gb_clf, X_test, y_test)

scores_dict['Gradient Boosting'] = {
        'Train': roc_auc_score(y_train, gb_clf.predict(X_train)),
        'Test': roc_auc_score(y_test, gb_clf.predict(X_test)),
    }

rf_clf = RandomForestClassifier(n_estimators=100)
rf_clf.fit(X_train, y_train)

y_train_pred = rf_clf.predict(X_train)
y_test_pred = rf_clf.predict(X_test)

print_score(y_train, y_train_pred, train=True)
print_score(y_test, y_test_pred, train=False)

Train Result:
________________________________________________
Accuracy Score: 100.00%
_______________________________________________
CLASSIFICATION REPORT:
                0.0       1.0  accuracy  macro avg  weighted avg
precision      1.00      1.00      1.00       1.00          1.00
recall         1.00      1.00      1.00       1.00          1.00
f1-score       1.00      1.00      1.00       1.00          1.00
support   719181.00 195135.00      1.00  914316.00     914316.00
_______________________________________________
Confusion Matrix: 
 [[719181      0]
 [    22 195113]]

Test Result:
________________________________________________
Accuracy Score: 94.53%
_______________________________________________
CLASSIFICATION REPORT:
                0.0      1.0  accuracy  macro avg  weighted avg
precision      0.95     0.92      0.95       0.94          0.94
recall         0.98     0.82      0.95       0.90          0.95
f1-score       0.97     0.86      0.95       0.92          0.94
support   358208.00 97798.00      0.95  456006.00     456006.00
_______________________________________________
Confusion Matrix: 
 [[351279   6929]
 [ 18022  79776]]

disp = ConfusionMatrixDisplay.from_estimator(rf_clf, X_test, y_test, 
                             cmap='Blues', values_format='d', 
                             display_labels=['Default', 'Fully-Paid'])

disp = RocCurveDisplay.from_estimator(rf_clf, X_test, y_test)

scores_dict['Random Forest'] = {
        'Train': roc_auc_score(y_train, rf_clf.predict(X_train)),
        'Test': roc_auc_score(y_test, rf_clf.predict(X_test)),
    }

xgb_clf = XGBClassifier(use_label_encoder=False)
xgb_clf.fit(X_train, y_train)

y_train_pred = xgb_clf.predict(X_train)
y_test_pred = xgb_clf.predict(X_test)

print_score(y_train, y_train_pred, train=True)
print_score(y_test, y_test_pred, train=False)

Train Result:
________________________________________________
Accuracy Score: 95.16%
_______________________________________________
CLASSIFICATION REPORT:
                0.0       1.0  accuracy  macro avg  weighted avg
precision      0.96      0.93      0.95       0.94          0.95
recall         0.98      0.84      0.95       0.91          0.95
f1-score       0.97      0.88      0.95       0.93          0.95
support   719181.00 195135.00      0.95  914316.00     914316.00
_______________________________________________
Confusion Matrix: 
 [[706188  12993]
 [ 31270 163865]]

Test Result:
________________________________________________
Accuracy Score: 94.89%
_______________________________________________
CLASSIFICATION REPORT:
                0.0      1.0  accuracy  macro avg  weighted avg
precision      0.96     0.92      0.95       0.94          0.95
recall         0.98     0.83      0.95       0.91          0.95
f1-score       0.97     0.87      0.95       0.92          0.95
support   358208.00 97798.00      0.95  456006.00     456006.00
_______________________________________________
Confusion Matrix: 
 [[351288   6920]
 [ 16388  81410]]

disp = ConfusionMatrixDisplay.from_estimator(
    xgb_clf, X_test, y_test, 
    cmap='Blues', values_format='d', 
    display_labels=['Default', 'Fully-Paid']
)

disp = RocCurveDisplay.from_estimator(xgb_clf, X_test, y_test)

scores_dict['XGBoost'] = {
        'Train': roc_auc_score(y_train, xgb_clf.predict(X_train)),
        'Test': roc_auc_score(y_test, xgb_clf.predict(X_test)),
    }

def evaluate_nn(true, pred, train=True):
    if train:
        clf_report = pd.DataFrame(classification_report(true, pred, output_dict=True))
        print("Train Result:\n================================================")
        print(f"Accuracy Score: {accuracy_score(true, pred) * 100:.2f}%")
        print("_______________________________________________")
        print(f"CLASSIFICATION REPORT:\n{clf_report}")
        print("_______________________________________________")
        print(f"Confusion Matrix: \n {confusion_matrix(true, pred)}\n")
        
    elif train==False:
        clf_report = pd.DataFrame(classification_report(true, pred, output_dict=True))
        print("Test Result:\n================================================")        
        print(f"Accuracy Score: {accuracy_score(true, pred) * 100:.2f}%")
        print("_______________________________________________")
        print(f"CLASSIFICATION REPORT:\n{clf_report}")
        print("_______________________________________________")
        print(f"Confusion Matrix: \n {confusion_matrix(true, pred)}\n")
        
def plot_learning_evolution(r):
    plt.figure(figsize=(12, 8))
    
    plt.subplot(2, 2, 1)
    plt.plot(r.history['loss'], label='Loss')
    plt.plot(r.history['val_loss'], label='val_Loss')
    plt.title('Loss evolution during trainig')
    plt.legend()

    plt.subplot(2, 2, 2)
    plt.plot(r.history['AUC'], label='AUC')
    plt.plot(r.history['val_AUC'], label='val_AUC')
    plt.title('AUC score evolution during trainig')
    plt.legend();

def nn_model(num_columns, num_labels, hidden_units, dropout_rates, learning_rate):
    inp = tf.keras.layers.Input(shape=(num_columns, ))
    x = BatchNormalization()(inp)
    x = Dropout(dropout_rates[0])(x)
    for i in range(len(hidden_units)):
        x = Dense(hidden_units[i], activation='relu')(x)
        x = BatchNormalization()(x)
        x = Dropout(dropout_rates[i + 1])(x)
    x = Dense(num_labels, activation='sigmoid')(x)
  
    model = Model(inputs=inp, outputs=x)
    model.compile(optimizer=Adam(learning_rate), loss='binary_crossentropy', metrics=[AUC(name='AUC')])
    return model

def setup_gpu():
    """Setup GPU configuration with error handling"""
    gpus = tf.config.experimental.list_physical_devices('GPU')
    if gpus:
        try:
            # Enable memory growth
            for gpu in gpus:
                tf.config.experimental.set_memory_growth(gpu, True)
            print(f"Using {len(gpus)} GPU(s) with memory growth")
        except RuntimeError as e:
            print(f"GPU setup error: {e}")
            print("Falling back to CPU")
            tf.config.set_visible_devices([], 'GPU')
    else:
        print("No GPU available, using CPU")

# Setup GPU
setup_gpu()

# --- Your model setup ---
num_columns = X_train.shape[1]
num_labels = 1
hidden_units = [150, 150, 150]
dropout_rates = [0.1, 0, 0.1, 0]
learning_rate = 1e-3

# Assuming nn_model is your custom function that builds a tf.keras model
model = nn_model(
    num_columns=num_columns, 
    num_labels=num_labels,
    hidden_units=hidden_units,
    dropout_rates=dropout_rates,
    learning_rate=learning_rate
)

# Train with error handling
try:
    r = model.fit(
        X_train, y_train,
        validation_data=(X_test, y_test),
        epochs=20,
        batch_size=32,
        verbose=1
    )
    print("Training completed successfully!")
    
except Exception as e:
    print(f"Training error: {e}")
    print("Trying with CPU...")
    
    # Fallback to CPU
    tf.config.set_visible_devices([], 'GPU')
    model = nn_model(
        num_columns=num_columns, 
        num_labels=num_labels,
        hidden_units=hidden_units,
        dropout_rates=dropout_rates,
        learning_rate=learning_rate
    )
    
    r = model.fit(
        X_train, y_train,
        validation_data=(X_test, y_test),
        epochs=20,
        batch_size=32,
        verbose=1
    )

2025-11-18 10:14:02.963038: E external/local_xla/xla/stream_executor/cuda/cuda_platform.cc:51] failed call to cuInit: INTERNAL: CUDA error: Failed call to cuInit: UNKNOWN ERROR (303)

No GPU available, using CPU
Epoch 1/20
28573/28573 ━━━━━━━━━━━━━━━━━━━━ 99s 3ms/step - AUC: 0.9584 - loss: 0.1863 - val_AUC: 0.9677 - val_loss: 0.1640
Epoch 2/20
28573/28573 ━━━━━━━━━━━━━━━━━━━━ 96s 3ms/step - AUC: 0.9628 - loss: 0.1741 - val_AUC: 0.9683 - val_loss: 0.1669
Epoch 3/20
28573/28573 ━━━━━━━━━━━━━━━━━━━━ 96s 3ms/step - AUC: 0.9639 - loss: 0.1713 - val_AUC: 0.9687 - val_loss: 0.1677
Epoch 4/20
28573/28573 ━━━━━━━━━━━━━━━━━━━━ 110s 4ms/step - AUC: 0.9643 - loss: 0.1701 - val_AUC: 0.9686 - val_loss: 0.1705
Epoch 5/20
28573/28573 ━━━━━━━━━━━━━━━━━━━━ 121s 4ms/step - AUC: 0.9644 - loss: 0.1699 - val_AUC: 0.9688 - val_loss: 0.1695
Epoch 6/20
28573/28573 ━━━━━━━━━━━━━━━━━━━━ 121s 4ms/step - AUC: 0.9645 - loss: 0.1696 - val_AUC: 0.9687 - val_loss: 0.1749
Epoch 7/20
28573/28573 ━━━━━━━━━━━━━━━━━━━━ 125s 4ms/step - AUC: 0.9649 - loss: 0.1690 - val_AUC: 0.9685 - val_loss: 0.1727
Epoch 8/20
28573/28573 ━━━━━━━━━━━━━━━━━━━━ 122s 4ms/step - AUC: 0.9648 - loss: 0.1690 - val_AUC: 0.9690 - val_loss: 0.1804
Epoch 9/20
28573/28573 ━━━━━━━━━━━━━━━━━━━━ 122s 4ms/step - AUC: 0.9651 - loss: 0.1686 - val_AUC: 0.9687 - val_loss: 0.1796
Epoch 10/20
28573/28573 ━━━━━━━━━━━━━━━━━━━━ 122s 4ms/step - AUC: 0.9651 - loss: 0.1685 - val_AUC: 0.9688 - val_loss: 0.1663
Epoch 11/20
28573/28573 ━━━━━━━━━━━━━━━━━━━━ 120s 4ms/step - AUC: 0.9651 - loss: 0.1682 - val_AUC: 0.9693 - val_loss: 0.1665
Epoch 12/20
28573/28573 ━━━━━━━━━━━━━━━━━━━━ 122s 4ms/step - AUC: 0.9652 - loss: 0.1681 - val_AUC: 0.9682 - val_loss: 0.1813
Epoch 13/20
28573/28573 ━━━━━━━━━━━━━━━━━━━━ 124s 4ms/step - AUC: 0.9653 - loss: 0.1678 - val_AUC: 0.9687 - val_loss: 0.2017
Epoch 14/20
28573/28573 ━━━━━━━━━━━━━━━━━━━━ 123s 4ms/step - AUC: 0.9651 - loss: 0.1684 - val_AUC: 0.9678 - val_loss: 0.1829
Epoch 15/20
28573/28573 ━━━━━━━━━━━━━━━━━━━━ 124s 4ms/step - AUC: 0.9654 - loss: 0.1677 - val_AUC: 0.9691 - val_loss: 0.2634
Epoch 16/20
28573/28573 ━━━━━━━━━━━━━━━━━━━━ 127s 4ms/step - AUC: 0.9657 - loss: 0.1673 - val_AUC: 0.9686 - val_loss: 0.1743
Epoch 17/20
28573/28573 ━━━━━━━━━━━━━━━━━━━━ 122s 4ms/step - AUC: 0.9656 - loss: 0.1671 - val_AUC: 0.9688 - val_loss: 0.1779
Epoch 18/20
28573/28573 ━━━━━━━━━━━━━━━━━━━━ 123s 4ms/step - AUC: 0.9655 - loss: 0.1674 - val_AUC: 0.9689 - val_loss: 0.1688
Epoch 19/20
28573/28573 ━━━━━━━━━━━━━━━━━━━━ 121s 4ms/step - AUC: 0.9655 - loss: 0.1673 - val_AUC: 0.9689 - val_loss: 0.1653
Epoch 20/20
28573/28573 ━━━━━━━━━━━━━━━━━━━━ 124s 4ms/step - AUC: 0.9655 - loss: 0.1673 - val_AUC: 0.9685 - val_loss: 0.2408
Training completed successfully!

plot_learning_evolution(r)

y_train_pred = model.predict(X_train)
evaluate_nn(y_train, y_train_pred.round(), train=True)

28573/28573 ━━━━━━━━━━━━━━━━━━━━ 33s 1ms/step
Train Result:
================================================
Accuracy Score: 93.59%
_______________________________________________
CLASSIFICATION REPORT:
                0.0       1.0  accuracy  macro avg  weighted avg
precision      0.93      0.95      0.94       0.94          0.94
recall         0.99      0.74      0.94       0.87          0.94
f1-score       0.96      0.83      0.94       0.90          0.93
support   719181.00 195135.00      0.94  914316.00     914316.00
_______________________________________________
Confusion Matrix: 
 [[710799   8382]
 [ 50200 144935]]

y_test_pred = model.predict(X_test)
evaluate_nn(y_test, y_test_pred.round(), train=False)

14251/14251 ━━━━━━━━━━━━━━━━━━━━ 16s 1ms/step
Test Result:
================================================
Accuracy Score: 93.40%
_______________________________________________
CLASSIFICATION REPORT:
                0.0      1.0  accuracy  macro avg  weighted avg
precision      0.93     0.94      0.93       0.94          0.93
recall         0.99     0.74      0.93       0.86          0.93
f1-score       0.96     0.83      0.93       0.89          0.93
support   358208.00 97798.00      0.93  456006.00     456006.00
_______________________________________________
Confusion Matrix: 
 [[353616   4592]
 [ 25494  72304]]

scores_dict['ANNs'] = {
        'Train': roc_auc_score(y_train, model.predict(X_train)),
        'Test': roc_auc_score(y_test, model.predict(X_test)),
    }

28573/28573 ━━━━━━━━━━━━━━━━━━━━ 33s 1ms/step
14251/14251 ━━━━━━━━━━━━━━━━━━━━ 17s 1ms/step

ml_models = {
    'Logistic Regression': lr_clf,
    'Decision Tree': dt_clf,
    'GNB': gnb_clf,
    'Gradient Boosting': gb_clf,
    'Random Forest': rf_clf,
    'XGBoost': xgb_clf,
    'ANNs': model
}

for name, clf in ml_models.items():
    try:
        # Special handling: Keras models need predict() output flattened
        if 'ANN' in name:
            y_pred_prob = clf.predict(X_test).ravel()
        elif hasattr(clf, "predict_proba"):
            y_pred_prob = clf.predict_proba(X_test)[:, 1]
        else:
            # fallback for models without predict_proba (e.g., some regressors)
            y_pred_prob = clf.predict(X_test)
            # convert to 0–1 range if not probabilities
            y_pred_prob = np.clip(y_pred_prob, 0, 1)
        
        auc = roc_auc_score(y_test, y_pred_prob)
        print(f"{name.upper():30} roc_auc_score: {auc:.3f}")
        
    except Exception as e:
        print(f"{name.upper():30} error: {e}")

LOGISTIC REGRESSION            roc_auc_score: 0.964
DECISION TREE                  roc_auc_score: 0.973
GNB                            roc_auc_score: 0.925
GRADIENT BOOSTING              roc_auc_score: 0.974
RANDOM FOREST                  roc_auc_score: 0.973
XGBOOST                        roc_auc_score: 0.977
14251/14251 ━━━━━━━━━━━━━━━━━━━━ 17s 1ms/step
ANNS                           roc_auc_score: 0.968

scores_df = pd.DataFrame(scores_dict)
scores_df.hvplot.barh(
    width=500, height=400, 
    title="ROC Scores of ML Models", xlabel="ROC Scores", 
    alpha=0.4, legend='top'
)

	loan_amnt	int_rate	installment	annual_inc	dti	pub_rec	recoveries	collection_recovery_fee	last_fico_range_high	last_fico_range_low	acc_now_delinq	delinq_amnt	tax_liens
count	2258852.00	2258852.00	2258852.00	2258852.00	2258852.00	2258852.00	2258852.00	2258852.00	2258852.00	2258852.00	2258852.00	2258852.00	2258852.00
mean	15044.31	13.09	445.74	78051.79	18.82	0.20	143.96	24.00	687.65	675.53	0.00	12.38	0.05
std	9188.00	4.83	267.11	112720.16	14.18	0.57	748.38	131.26	72.97	111.11	0.07	726.74	0.38
min	500.00	5.31	4.93	0.00	-1.00	0.00	0.00	0.00	0.00	0.00	0.00	0.00	0.00
25%	8000.00	9.49	251.62	46000.00	11.90	0.00	0.00	0.00	654.00	650.00	0.00	0.00	0.00
50%	12900.00	12.62	377.94	65000.00	17.84	0.00	0.00	0.00	699.00	695.00	0.00	0.00	0.00
75%	20000.00	15.99	593.06	93000.00	24.49	0.00	0.00	0.00	734.00	730.00	0.00	0.00	0.00
max	40000.00	30.99	1719.83	110000000.00	999.00	86.00	39859.55	7174.72	850.00	845.00	14.00	249925.00	85.00

	loan_amnt	term	int_rate	installment	grade	home_ownership	annual_inc	verification_status	loan_status	purpose	addr_state	dti	pub_rec	initial_list_status	last_fico_range_high	last_fico_range_low	application_type
767727	10000.00	36 months	16.14	352.27	C	RENT	60000.00	Source Verified	Good Loan	other	CA	11.90	0.00	w	679.00	675.00	Individual
2008787	12000.00	36 months	10.49	389.98	B	MORTGAGE	69000.00	Source Verified	Good Loan	debt_consolidation	GA	17.40	1.00	w	704.00	700.00	Individual
119744	12000.00	60 months	16.99	298.17	D	MORTGAGE	80000.00	Not Verified	Good Loan	other	FL	34.80	0.00	w	684.00	680.00	Individual
2157409	9525.00	36 months	11.39	313.60	B	MORTGAGE	100000.00	Not Verified	Good Loan	debt_consolidation	FL	9.18	0.00	w	714.00	710.00	Individual
1961525	25000.00	36 months	12.79	839.83	C	MORTGAGE	88000.00	Verified	Good Loan	debt_consolidation	NH	18.30	0.00	f	719.00	715.00	Individual

Loan Defaulters Prediction of Lending Club Dataset¶

Introduction¶

Import Libraries¶

Load Dataset¶

Exploratory Data Analysis¶

Define the mapping loan_status for "Good Loan" and "Bad Loan"¶

Data PreProcessing¶

Train Test Split¶

Removing Outliers¶

Normalizing the data¶

Models Building¶

Logistic Regression¶

Decision Tree Classifier¶

Gaussian Naive Bayes¶

Gradient Boosting¶

Random Forest Classifier¶

XGBoost Classifier¶

Artificial Neural Networks (ANNs)¶

References¶

	id	loan_amnt	term	int_rate	installment	grade	home_ownership	annual_inc	verification_status	loan_status	purpose	addr_state	dti	initial_list_status	last_fico_range_high	last_fico_range_low	application_type
0	68407277	3600.00	36 months	13.99	123.03	C	MORTGAGE	55000.00	Not Verified	Fully Paid	debt_consolidation	PA	5.91	w	564.00	560.00	Individual
1	68355089	24700.00	36 months	11.99	820.28	C	MORTGAGE	65000.00	Not Verified	Fully Paid	small_business	SD	16.06	w	699.00	695.00	Individual
2	68341763	20000.00	60 months	10.78	432.66	B	MORTGAGE	63000.00	Not Verified	Fully Paid	home_improvement	IL	10.78	w	704.00	700.00	Joint App
3	66310712	35000.00	60 months	14.85	829.90	C	MORTGAGE	110000.00	Source Verified	Current	debt_consolidation	NJ	17.06	w	679.00	675.00	Individual
4	68476807	10400.00	60 months	22.45	289.91	F	MORTGAGE	104433.00	Source Verified	Fully Paid	major_purchase	PA	25.37	w	704.00	700.00	Individual

	loan_amnt	term	int_rate	installment	annual_inc	dti	recoveries	collection_recovery_fee	last_fico_range_high	last_fico_range_low	grade_B	grade_C	grade_D	grade_E	grade_F	grade_G	home_ownership_MORTGAGE	home_ownership_NONE	home_ownership_OTHER	home_ownership_OWN	home_ownership_RENT	...	addr_state_ND	addr_state_NE	addr_state_NH	addr_state_NJ	addr_state_NM	addr_state_NV	addr_state_NY	addr_state_OH	addr_state_OK	addr_state_OR	addr_state_PA	addr_state_RI	addr_state_SC	addr_state_SD	addr_state_TN	addr_state_TX	addr_state_UT	addr_state_VA	addr_state_VT	addr_state_WA	addr_state_WI	addr_state_WV	addr_state_WY	initial_list_status_w	application_type_Joint App
1304582	25000.00	36	15.31	870.44	150000.00	12.68	0.00	0.00	639.00	635.00	False	True	False	False	False	False	False	False	False	True	False	...	False	False	False	False	False	False	False	False	False	False	False	False	False	False	False	False	False	False	False	False	False	False	False	False	False
1118221	4850.00	36	22.99	187.72	48000.00	20.95	231.28	41.63	559.00	555.00	False	False	False	False	True	False	True	False	False	False	False	...	False	False	False	False	False	False	False	False	False	False	False	False	False	False	False	False	False	False	False	False	False	False	False	False	False
1862395	6625.00	36	13.11	223.58	22500.00	33.23	487.13	86.96	609.00	605.00	True	False	False	False	False	False	False	False	False	False	True	...	False	False	False	False	False	False	False	False	False	False	False	False	False	False	False	False	False	False	False	False	False	False	False	False	False
420596	10000.00	36	12.39	334.01	31814.00	36.70	0.00	0.00	634.00	630.00	False	True	False	False	False	False	False	False	False	False	True	...	False	False	False	False	False	False	False	False	False	True	False	False	False	False	False	False	False	False	False	False	False	False	False	False	False
2035521	12000.00	36	13.99	410.08	30000.00	7.00	0.00	0.00	519.00	515.00	False	True	False	False	False	False	False	False	False	False	True	...	False	False	False	False	False	True	False	False	False	False	False	False	False	False	False	False	False	False	False	False	False	False	False	False	False