In [2]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
In [3]:
df=pd.read_csv('Bank_Churn.csv')
Check the top 5 rows of the dataset
In [4]:
df.head()
Out[4]:
  | CustomerId | Surname | CreditScore | Geography | Gender | Age | Tenure | Balance | NumOfProducts | HasCrCard | IsActiveMember | EstimatedSalary | Exited |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 15634602 | Hargrave | 619 | France | Female | 42 | 2 | 0.00 | 1 | 1 | 1 | 101348.88 | 1 |
1 | 15647311 | Hill | 608 | Spain | Female | 41 | 1 | 83807.86 | 1 | 0 | 1 | 112542.58 | 0 |
2 | 15619304 | Onio | 502 | France | Female | 42 | 8 | 159660.80 | 3 | 1 | 0 | 113931.57 | 1 |
3 | 15701354 | Boni | 699 | France | Female | 39 | 1 | 0.00 | 2 | 0 | 0 | 93826.63 | 0 |
4 | 15737888 | Mitchell | 850 | Spain | Female | 43 | 2 | 125510.82 | 1 | 1 | 1 | 79084.10 | 0 |
Information and statistics about the dataset
In [5]:
df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 13 columns):
 #   Column           Non-Null Count  Dtype
---  ------           --------------  -----
 0   CustomerId       10000 non-null  int64
 1   Surname          10000 non-null  object
 2   CreditScore      10000 non-null  int64
 3   Geography        10000 non-null  object
 4   Gender           10000 non-null  object
 5   Age              10000 non-null  int64
 6   Tenure           10000 non-null  int64
 7   Balance          10000 non-null  float64
 8   NumOfProducts    10000 non-null  int64
 9   HasCrCard        10000 non-null  int64
 10  IsActiveMember   10000 non-null  int64
 11  EstimatedSalary  10000 non-null  float64
 12  Exited           10000 non-null  int64
dtypes: float64(2), int64(8), object(3)
memory usage: 1015.8+ KB
In [6]:
df.describe(include='all')
Out[6]:
  | CustomerId | Surname | CreditScore | Geography | Gender | Age | Tenure | Balance | NumOfProducts | HasCrCard | IsActiveMember | EstimatedSalary | Exited |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|
count | 1.000000e+04 | 10000 | 10000.000000 | 10000 | 10000 | 10000.000000 | 10000.000000 | 10000.000000 | 10000.000000 | 10000.00000 | 10000.000000 | 10000.000000 | 10000.000000 |
unique | NaN | 2932 | NaN | 3 | 2 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
top | NaN | Smith | NaN | France | Male | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
freq | NaN | 32 | NaN | 5014 | 5457 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
mean | 1.569094e+07 | NaN | 650.528800 | NaN | NaN | 38.921800 | 5.012800 | 76485.889288 | 1.530200 | 0.70550 | 0.515100 | 100090.239881 | 0.203700 |
std | 7.193619e+04 | NaN | 96.653299 | NaN | NaN | 10.487806 | 2.892174 | 62397.405202 | 0.581654 | 0.45584 | 0.499797 | 57510.492818 | 0.402769 |
min | 1.556570e+07 | NaN | 350.000000 | NaN | NaN | 18.000000 | 0.000000 | 0.000000 | 1.000000 | 0.00000 | 0.000000 | 11.580000 | 0.000000 |
25% | 1.562853e+07 | NaN | 584.000000 | NaN | NaN | 32.000000 | 3.000000 | 0.000000 | 1.000000 | 0.00000 | 0.000000 | 51002.110000 | 0.000000 |
50% | 1.569074e+07 | NaN | 652.000000 | NaN | NaN | 37.000000 | 5.000000 | 97198.540000 | 1.000000 | 1.00000 | 1.000000 | 100193.915000 | 0.000000 |
75% | 1.575323e+07 | NaN | 718.000000 | NaN | NaN | 44.000000 | 7.000000 | 127644.240000 | 2.000000 | 1.00000 | 1.000000 | 149388.247500 | 0.000000 |
max | 1.581569e+07 | NaN | 850.000000 | NaN | NaN | 92.000000 | 10.000000 | 250898.090000 | 4.000000 | 1.00000 | 1.000000 | 199992.480000 | 1.000000 |
Check for null values in the dataset
In [7]:
df.isnull().sum()
Out[7]:
CustomerId         0
Surname            0
CreditScore        0
Geography          0
Gender             0
Age                0
Tenure             0
Balance            0
NumOfProducts      0
HasCrCard          0
IsActiveMember     0
EstimatedSalary    0
Exited             0
dtype: int64
Dropping irrelevant columns
In [8]:
df.drop(columns=['CustomerId', 'Surname'], inplace=True)  # axis is implied by columns=
EDA
In [9]:
df['Exited'].value_counts()
Out[9]:
Exited
0    7963
1    2037
Name: count, dtype: int64
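Roughly one customer in five has churned, so the classes are imbalanced. As a quick check, value_counts(normalize=True) expresses the same split as proportions (a minimal sketch; the figures below are derived from the counts above):
df['Exited'].value_counts(normalize=True)
# Exited
# 0    0.7963
# 1    0.2037
# Name: proportion, dtype: float64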
In [10]:
# Investigate all the elements within each feature
for column in df:
    unique_vals = np.unique(df[column].fillna('0'))
    nr_values = len(unique_vals)
    if nr_values <= 12:
        print('The number of values for feature {} :{} -- {}'.format(column, nr_values, unique_vals))
    else:
        print('The number of values for feature {} :{}'.format(column, nr_values))
The number of values for feature CreditScore :460
The number of values for feature Geography :3 -- ['France' 'Germany' 'Spain']
The number of values for feature Gender :2 -- ['Female' 'Male']
The number of values for feature Age :70
The number of values for feature Tenure :11 -- [ 0  1  2  3  4  5  6  7  8  9 10]
The number of values for feature Balance :6382
The number of values for feature NumOfProducts :4 -- [1 2 3 4]
The number of values for feature HasCrCard :2 -- [0 1]
The number of values for feature IsActiveMember :2 -- [0 1]
The number of values for feature EstimatedSalary :9999
The number of values for feature Exited :2 -- [0 1]
In [11]:
# Count Plot of our Y - Check the balance of the dataset
plt.figure(figsize=(8, 6))
sns.countplot(data=df, x="Exited")
plt.title("Distribution of Exited", fontsize=16)
plt.xlabel("Exited", fontsize=14)
plt.ylabel("Count", fontsize=14)
plt.xticks([0, 1], labels=["Not Exited", "Exited"])
plt.show()
In [12]:
df['Geography'].value_counts()
Out[12]:
Geography
France     5014
Germany    2509
Spain      2477
Name: count, dtype: int64
In [13]:
df_grp=df.groupby('Geography')
df_grp['CreditScore'].mean()
Out[13]:
Geography
France     649.668329
Germany    651.453567
Spain      651.333872
Name: CreditScore, dtype: float64
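Credit scores are nearly identical across countries. The same groupby pattern applied to the target gives the churn rate per country, which is usually more informative — a minimal sketch (not run here):
df.groupby('Geography')['Exited'].mean()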
In [14]:
sns.pairplot(df,hue="Exited")
plt.show()
In [15]:
# Identify all numeric columns
numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
numeric_cols = df.select_dtypes(include=numerics).columns
# Determine number of subplots (3 columns per row)
n_cols = 3
n_plots = len(numeric_cols)
n_rows = int(np.ceil(n_plots / n_cols))
# Set up the subplots
fig, axes = plt.subplots(n_rows, n_cols, figsize=(15, n_rows * 4))
axes = axes.flatten() # flatten to 1D for easy indexing
# Plot each boxplot in the grid
for i, col in enumerate(numeric_cols):
    sns.boxplot(data=df, x=col, ax=axes[i], color='#D1EC46')
    axes[i].set_title(f'{col} (Median: {df[col].median():.2f})')
    axes[i].set_xlabel("")
# Remove unused subplots if any
for j in range(i + 1, len(axes)):
    fig.delaxes(axes[j])
plt.tight_layout()
plt.show()
In [16]:
# Select object (categorical) columns
cat_cols = df.select_dtypes(include='object').columns
# Limit to top N columns by unique value count (optional)
# cat_cols = [col for col in cat_cols if df[col].nunique() < 20] # Limit high-cardinality cols
# Grid config: 3 plots per row
n_cols = 3
n_plots = len(cat_cols)
n_rows = int(np.ceil(n_plots / n_cols))
# Set up subplots
fig, axes = plt.subplots(n_rows, n_cols, figsize=(18, n_rows * 5))
axes = axes.flatten()
# Plot count plots for each categorical column
for i, col in enumerate(cat_cols):
    sns.countplot(y=col, data=df, order=df[col].value_counts().index, ax=axes[i], hue=col, palette='Set3', legend=False)
    axes[i].set_title(f'Count Plot of {col}')
    axes[i].set_xlabel("Count")
    axes[i].set_ylabel(col)
    axes[i].tick_params(axis='y', labelsize=9)
# Hide any extra subplots
for j in range(i + 1, len(axes)):
    fig.delaxes(axes[j])
plt.tight_layout()
plt.show()
Encoding Categorical Data
In [17]:
df=pd.get_dummies(df,drop_first=True)
In [18]:
df.head()
Out[18]:
  | CreditScore | Age | Tenure | Balance | NumOfProducts | HasCrCard | IsActiveMember | EstimatedSalary | Exited | Geography_Germany | Geography_Spain | Gender_Male |
---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 619 | 42 | 2 | 0.00 | 1 | 1 | 1 | 101348.88 | 1 | False | False | False |
1 | 608 | 41 | 1 | 83807.86 | 1 | 0 | 1 | 112542.58 | 0 | False | True | False |
2 | 502 | 42 | 8 | 159660.80 | 3 | 1 | 0 | 113931.57 | 1 | False | False | False |
3 | 699 | 39 | 1 | 0.00 | 2 | 0 | 0 | 93826.63 | 0 | False | False | False |
4 | 850 | 43 | 2 | 125510.82 | 1 | 1 | 1 | 79084.10 | 0 | False | True | False |
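Note that recent pandas versions emit the dummy columns as booleans (False/True above). If a downstream consumer expects 0/1 integers, get_dummies takes a dtype argument — a hedged alternative to the encoding call above:
# Same one-hot encoding, but produce 0/1 ints instead of booleans
df = pd.get_dummies(df, drop_first=True, dtype=int)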
In [19]:
plt.figure(figsize=(20,7))
sns.heatmap(df.corr(),annot=True)
plt.show()
In [20]:
df.corr()
Out[20]:
  | CreditScore | Age | Tenure | Balance | NumOfProducts | HasCrCard | IsActiveMember | EstimatedSalary | Exited | Geography_Germany | Geography_Spain | Gender_Male |
---|---|---|---|---|---|---|---|---|---|---|---|---|
CreditScore | 1.000000 | -0.003965 | 0.000842 | 0.006268 | 0.012238 | -0.005458 | 0.025651 | -0.001384 | -0.027094 | 0.005538 | 0.004780 | -0.002857 |
Age | -0.003965 | 1.000000 | -0.009997 | 0.028308 | -0.030680 | -0.011721 | 0.085472 | -0.007201 | 0.285323 | 0.046897 | -0.001685 | -0.027544 |
Tenure | 0.000842 | -0.009997 | 1.000000 | -0.012254 | 0.013444 | 0.022583 | -0.028362 | 0.007784 | -0.014001 | -0.000567 | 0.003868 | 0.014733 |
Balance | 0.006268 | 0.028308 | -0.012254 | 1.000000 | -0.304180 | -0.014858 | -0.010084 | 0.012797 | 0.118533 | 0.401110 | -0.134892 | 0.012087 |
NumOfProducts | 0.012238 | -0.030680 | 0.013444 | -0.304180 | 1.000000 | 0.003183 | 0.009612 | 0.014204 | -0.047820 | -0.010419 | 0.009039 | -0.021859 |
HasCrCard | -0.005458 | -0.011721 | 0.022583 | -0.014858 | 0.003183 | 1.000000 | -0.011866 | -0.009933 | -0.007138 | 0.010577 | -0.013480 | 0.005766 |
IsActiveMember | 0.025651 | 0.085472 | -0.028362 | -0.010084 | 0.009612 | -0.011866 | 1.000000 | -0.011421 | -0.156128 | -0.020486 | 0.016732 | 0.022544 |
EstimatedSalary | -0.001384 | -0.007201 | 0.007784 | 0.012797 | 0.014204 | -0.009933 | -0.011421 | 1.000000 | 0.012097 | 0.010297 | -0.006482 | -0.008112 |
Exited | -0.027094 | 0.285323 | -0.014001 | 0.118533 | -0.047820 | -0.007138 | -0.156128 | 0.012097 | 1.000000 | 0.173488 | -0.052667 | -0.106512 |
Geography_Germany | 0.005538 | 0.046897 | -0.000567 | 0.401110 | -0.010419 | 0.010577 | -0.020486 | 0.010297 | 0.173488 | 1.000000 | -0.332084 | -0.024628 |
Geography_Spain | 0.004780 | -0.001685 | 0.003868 | -0.134892 | 0.009039 | -0.013480 | 0.016732 | -0.006482 | -0.052667 | -0.332084 | 1.000000 | 0.016889 |
Gender_Male | -0.002857 | -0.027544 | 0.014733 | 0.012087 | -0.021859 | 0.005766 | 0.022544 | -0.008112 | -0.106512 | -0.024628 | 0.016889 | 1.000000 |
Feature Scaling
In [21]:
df.head()
Out[21]:
  | CreditScore | Age | Tenure | Balance | NumOfProducts | HasCrCard | IsActiveMember | EstimatedSalary | Exited | Geography_Germany | Geography_Spain | Gender_Male |
---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 619 | 42 | 2 | 0.00 | 1 | 1 | 1 | 101348.88 | 1 | False | False | False |
1 | 608 | 41 | 1 | 83807.86 | 1 | 0 | 1 | 112542.58 | 0 | False | True | False |
2 | 502 | 42 | 8 | 159660.80 | 3 | 1 | 0 | 113931.57 | 1 | False | False | False |
3 | 699 | 39 | 1 | 0.00 | 2 | 0 | 0 | 93826.63 | 0 | False | False | False |
4 | 850 | 43 | 2 | 125510.82 | 1 | 1 | 1 | 79084.10 | 0 | False | True | False |
In [22]:
from sklearn.preprocessing import StandardScaler
# Assuming df is the original dataframe
columns_to_scale = ['CreditScore', 'EstimatedSalary', 'Tenure', 'Balance', 'Age', 'NumOfProducts']
# Create a new dataframe for scaling
df_scale = df.copy()
# Standardize the selected columns
scaler = StandardScaler()
df_scale[columns_to_scale] = scaler.fit_transform(df[columns_to_scale])
df_scale.head()
Out[22]:
  | CreditScore | Age | Tenure | Balance | NumOfProducts | HasCrCard | IsActiveMember | EstimatedSalary | Exited | Geography_Germany | Geography_Spain | Gender_Male |
---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | -0.326221 | 0.293517 | -1.041760 | -1.225848 | -0.911583 | 1 | 1 | 0.021886 | 1 | False | False | False |
1 | -0.440036 | 0.198164 | -1.387538 | 0.117350 | -0.911583 | 0 | 1 | 0.216534 | 0 | False | True | False |
2 | -1.536794 | 0.293517 | 1.032908 | 1.333053 | 2.527057 | 1 | 0 | 0.240687 | 1 | False | False | False |
3 | 0.501521 | 0.007457 | -1.387538 | -1.225848 | 0.807737 | 0 | 0 | -0.108918 | 0 | False | False | False |
4 | 2.063884 | 0.388871 | -1.041760 | 0.785728 | -0.911583 | 1 | 1 | -0.365276 | 0 | False | True | False |
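One caveat with scaling before the split below: the scaler's mean and standard deviation are computed from all 10,000 rows, so test-set statistics leak into training. A leakage-free variant, sketched with hypothetical names (X_tr, X_te) under the assumption that the split happens first:
from sklearn.model_selection import train_test_split

X_raw = df.drop(columns=['Exited'])
y_raw = df['Exited']
X_tr, X_te, y_tr, y_te = train_test_split(X_raw, y_raw, test_size=0.33, random_state=42)

X_tr, X_te = X_tr.copy(), X_te.copy()
scaler = StandardScaler()
X_tr[columns_to_scale] = scaler.fit_transform(X_tr[columns_to_scale])  # fit on train only
X_te[columns_to_scale] = scaler.transform(X_te[columns_to_scale])      # reuse train statistics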
Splitting the dataset
In [23]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
In [24]:
X=df_scale.drop(columns=['Exited'],axis=1)
y=df_scale['Exited']
In [25]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)
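Since churners are only about 20% of the data, passing stratify=y keeps the class ratio identical in the train and test splits — a small, optional variation on the call above:
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.33, random_state=42, stratify=y)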
Logistic Regression
In [26]:
lmodel=LogisticRegression(max_iter=100)
In [27]:
lmodel.fit(X_train,y_train)
Out[27]:
LogisticRegression()
In [28]:
y_predicted=lmodel.predict(X_test)
In [29]:
from sklearn.metrics import classification_report,confusion_matrix, ConfusionMatrixDisplay,accuracy_score
accuracy_score(y_test,y_predicted)
Out[29]:
0.8112121212121212
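81% accuracy looks respectable until it is compared with the majority-class baseline: always predicting "not exited" already scores about 0.805 on this test set (2657 of 3300, per the report below). A minimal sketch of that check:
baseline = y_test.value_counts(normalize=True).max()
print(f"Majority-class baseline accuracy: {baseline:.4f}")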
In [30]:
cm=confusion_matrix(y_test,y_predicted)
disp = ConfusionMatrixDisplay(confusion_matrix=cm)
disp.plot(cmap='Blues') # You can choose another color map if you want
plt.title("Confusion Matrix")
plt.show()
In [31]:
print(classification_report(y_test,y_predicted))
              precision    recall  f1-score   support

           0       0.83      0.96      0.89      2657
           1       0.54      0.21      0.30       643

    accuracy                           0.81      3300
   macro avg       0.69      0.58      0.59      3300
weighted avg       0.78      0.81      0.78      3300
DecisionTree
In [32]:
from sklearn.tree import DecisionTreeClassifier
In [33]:
dmodel=DecisionTreeClassifier(random_state=0)
In [34]:
dmodel.fit(X_train,y_train)
y_predicted=dmodel.predict(X_test)
In [35]:
print(classification_report(y_test,y_predicted))
              precision    recall  f1-score   support

           0       0.88      0.86      0.87      2657
           1       0.46      0.50      0.48       643

    accuracy                           0.79      3300
   macro avg       0.67      0.68      0.67      3300
weighted avg       0.79      0.79      0.79      3300
In [36]:
cm=confusion_matrix(y_test,y_predicted)
disp = ConfusionMatrixDisplay(confusion_matrix=cm)
disp.plot(cmap='Blues') # You can choose another color map if you want
plt.title("Confusion Matrix")
plt.show()
RandomForestClassifier
In [37]:
from sklearn.ensemble import RandomForestClassifier
In [38]:
rmodel=RandomForestClassifier(max_depth=200, random_state=0)
In [39]:
rmodel.fit(X_train,y_train)
y_predicted=rmodel.predict(X_test)
In [40]:
cm=confusion_matrix(y_test,y_predicted)
disp = ConfusionMatrixDisplay(confusion_matrix=cm)
disp.plot(cmap='Blues') # You can choose another color map if you want
plt.title("Confusion Matrix")
plt.show()
In [41]:
print(classification_report(y_test,y_predicted))
              precision    recall  f1-score   support

           0       0.88      0.97      0.92      2657
           1       0.77      0.48      0.59       643

    accuracy                           0.87      3300
   macro avg       0.83      0.72      0.76      3300
weighted avg       0.86      0.87      0.86      3300
Considering all data (Random Forest)
In [42]:
y_pred_all=rmodel.predict(X)
In [43]:
cm=confusion_matrix(y,y_pred_all)
disp = ConfusionMatrixDisplay(confusion_matrix=cm)
disp.plot(cmap='Blues') # You can choose another color map if you want
plt.title("Confusion Matrix")
plt.show()
In [44]:
print(classification_report(y,y_pred_all))
              precision    recall  f1-score   support

           0       0.96      0.99      0.97      7963
           1       0.95      0.84      0.89      2037

    accuracy                           0.96     10000
   macro avg       0.95      0.91      0.93     10000
weighted avg       0.96      0.96      0.96     10000
Handle Imbalanced data Using SMOTE
In [46]:
from imblearn.over_sampling import SMOTE
In [47]:
# Step 1: Apply SMOTE to balance the training data
from collections import Counter
# Check original class distribution
print("Before SMOTE:", Counter(y_train))
# Step 1: Apply SMOTE to balance the training data
smote = SMOTE()
X_train_smote, y_train_smote = smote.fit_resample(X_train, y_train)
# Check class distribution after SMOTE
print("After SMOTE:", Counter(y_train_smote))
Before SMOTE: Counter({0: 5306, 1: 1394})
After SMOTE: Counter({1: 5306, 0: 5306})
In [49]:
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
# 2. Define your models
models = {
    "Logistic Regression": LogisticRegression(max_iter=1000),
    "SVM": SVC(),
    "Random Forest": RandomForestClassifier(),
    "KNN": KNeighborsClassifier(),
    "ANN (MLP)": MLPClassifier(max_iter=500),
    "XGBoost": XGBClassifier(eval_metric="logloss")  # use_label_encoder is no longer used by xgboost
}
# 3. Train, predict & collect metrics
results = []
for name, model in models.items():
    # fit on the SMOTE-resampled training set
    model.fit(X_train_smote, y_train_smote)
    # predict on the original (untouched) test set
    y_pred = model.predict(X_test)
    results.append({
        "Model": name,
        "Accuracy": accuracy_score(y_test, y_pred),
        "Precision": precision_score(y_test, y_pred),
        "Recall": recall_score(y_test, y_pred),
        "F1-Score": f1_score(y_test, y_pred)
    })
# 4. Display sorted results
results_df = pd.DataFrame(results)
print(results_df.sort_values("F1-Score", ascending=False).round(4))
                 Model  Accuracy  Precision  Recall  F1-Score
2        Random Forest    0.8464     0.6040  0.6143    0.6091
5              XGBoost    0.8548     0.6464  0.5630    0.6018
1                  SVM    0.7979     0.4873  0.7185    0.5808
4            ANN (MLP)    0.7961     0.4836  0.6890    0.5683
3                  KNN    0.7521     0.4189  0.7030    0.5250
0  Logistic Regression    0.7012     0.3567  0.6641    0.4641
In [50]:
rf_smote=RandomForestClassifier()
rf_smote.fit(X_train_smote, y_train_smote)
y_pred = rf_smote.predict(X)
In [51]:
cm=confusion_matrix(y,y_pred)
disp = ConfusionMatrixDisplay(confusion_matrix=cm)
disp.plot(cmap='Blues') # You can choose another color map if you want
plt.title("Confusion Matrix")
plt.show()
Hyperparameter tuning on Random Forest using RandomizedSearchCV
In [52]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import make_scorer, f1_score
# Define the base model
rf = RandomForestClassifier(
    class_weight='balanced',
    n_jobs=-1,
    random_state=42
)
# Define the parameter distributions for RandomizedSearchCV
param_dist = {
    'n_estimators': [100, 200, 300],
    'max_depth': [10, 20, 30, None],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
}
# F1 scorer
f1 = make_scorer(f1_score, average='macro')  # or 'weighted'/'micro', depending on the use case
# Setup RandomizedSearchCV
random_search = RandomizedSearchCV(
    estimator=rf,
    param_distributions=param_dist,
    n_iter=20,
    scoring=f1,
    cv=5,
    n_jobs=-1,
    verbose=2,
    random_state=42
)
# Fit the model
random_search.fit(X_train_smote, y_train_smote)
# Best parameters and best score
print("Best Parameters:", random_search.best_params_)
print("Best F1 Score:", random_search.best_score_)
# Evaluate on test set
best_rf = random_search.best_estimator_
print("Test Accuracy:", best_rf.score(X_test, y_test))
# Predict with the best model
y_predicted = best_rf.predict(X_test)
# Compute and display confusion matrix
cm = confusion_matrix(y_test, y_predicted)
disp = ConfusionMatrixDisplay(confusion_matrix=cm)
disp.plot(cmap='Blues')
plt.title("Confusion Matrix of Best Model (RandomForest)")
plt.show()
Fitting 5 folds for each of 20 candidates, totalling 100 fits
Best Parameters: {'n_estimators': 300, 'min_samples_split': 2, 'min_samples_leaf': 1, 'max_depth': 30}
Best F1 Score: 0.8979040353845351
Test Accuracy: 0.8454545454545455
Feature Importance (Random Forest)
In [53]:
importances = best_rf.feature_importances_
features = X.columns
feature_df = pd.DataFrame({'Feature': features, 'Importance': importances})
# Sort and save top features
feature_df = feature_df.sort_values(by='Importance', ascending=False)
print(feature_df.head(10))
feature_df.to_csv('feature_importance.csv', index=False)
# Plot
feature_df.nlargest(10, 'Importance').plot(kind='barh', x='Feature', y='Importance', legend=False)
plt.title("Top 10 Feature Importances")
plt.xlabel("Importance")
plt.ylabel("Feature")
plt.gca().invert_yaxis() # To display the most important at the top
plt.tight_layout()
plt.show()
              Feature  Importance
1                 Age    0.270309
3             Balance    0.128561
4       NumOfProducts    0.127060
7     EstimatedSalary    0.112884
0         CreditScore    0.110301
2              Tenure    0.098125
6      IsActiveMember    0.060269
8   Geography_Germany    0.043225
5           HasCrCard    0.017522
10        Gender_Male    0.017217
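Impurity-based importances tend to inflate continuous features with many unique values, which may explain EstimatedSalary ranking above IsActiveMember despite its near-zero correlation with Exited. Permutation importance on the held-out test set is a common cross-check — a sketch using sklearn's inspection module:
from sklearn.inspection import permutation_importance

perm = permutation_importance(best_rf, X_test, y_test, n_repeats=10, random_state=42, scoring='f1')
perm_df = pd.DataFrame({'Feature': X.columns, 'Importance': perm.importances_mean})
print(perm_df.sort_values('Importance', ascending=False).head(10))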
In [54]:
from sklearn.metrics import (
    classification_report, confusion_matrix, roc_auc_score,
    roc_curve, ConfusionMatrixDisplay, precision_recall_curve
)
# --- Predictions ---
y_pred = best_rf.predict(X_test)
y_proba = best_rf.predict_proba(X_test)[:, 1]
# --- Print Scores ---
print(classification_report(y_test, y_pred))
print("ROC AUC:", roc_auc_score(y_test, y_proba))
# --- Confusion Matrix ---
cm = confusion_matrix(y_test, y_pred)
plt.figure(figsize=(5, 4))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues')
plt.title('Confusion Matrix')
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.show()
# --- ROC Curve ---
fpr, tpr, thresholds = roc_curve(y_test, y_proba)
plt.figure(figsize=(6, 5))
plt.plot(fpr, tpr, label=f"ROC AUC = {roc_auc_score(y_test, y_proba):.2f}")
plt.plot([0, 1], [0, 1], 'k--', label="Random Classifier")
plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate")
plt.title("ROC Curve")
plt.legend()
plt.grid(True)
plt.show()
              precision    recall  f1-score   support

           0       0.91      0.90      0.90      2657
           1       0.60      0.61      0.61       643

    accuracy                           0.85      3300
   macro avg       0.75      0.76      0.76      3300
weighted avg       0.85      0.85      0.85      3300

ROC AUC: 0.8516217322006894
In [55]:
#Considering all actual data
y_pred_all=best_rf.predict(X)
cm=confusion_matrix(y,y_pred_all)
disp = ConfusionMatrixDisplay(confusion_matrix=cm)
disp.plot(cmap='Blues') # You can choose another color map if you want
plt.title("Confusion Matrix")
plt.show()
In [56]:
print(classification_report(y, y_pred_all))
precision recall f1-score support 0 0.97 0.97 0.97 7963 1 0.87 0.88 0.88 2037 accuracy 0.95 10000 macro avg 0.92 0.92 0.92 10000 weighted avg 0.95 0.95 0.95 10000
In [57]:
y_test_one = best_rf.predict(X_test[0:8])
best_rf.predict_proba(X_test[0:8])
Out[57]:
array([[0.82333333, 0.17666667],
       [0.96333333, 0.03666667],
       [0.72      , 0.28      ],
       [0.8       , 0.2       ],
       [0.67333333, 0.32666667],
       [1.        , 0.        ],
       [0.79666667, 0.20333333],
       [0.50666667, 0.49333333]])
In [58]:
y_test_one
Out[58]:
array([0, 0, 0, 0, 0, 0, 0, 0])
In [59]:
y_test[0:8]
Out[59]:
6252    0
4684    0
1731    0
4742    0
4521    0
6340    0
576     0
5202    1
Name: Exited, dtype: int64
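The last sample above (index 5202) is a true churner scored at 0.493, just under the default 0.5 cutoff. Lowering the decision threshold trades precision for recall — a hedged sketch with an illustrative threshold of 0.4, which would properly be tuned on a validation set:
threshold = 0.4  # illustrative value, not tuned
proba = best_rf.predict_proba(X_test)[:, 1]
y_pred_low = (proba >= threshold).astype(int)
print(classification_report(y_test, y_pred_low))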
Model Calibration
In [60]:
X.loc[0,:]
Out[60]:
CreditScore          -0.326221
Age                   0.293517
Tenure                -1.04176
Balance              -1.225848
NumOfProducts        -0.911583
HasCrCard                    1
IsActiveMember               1
EstimatedSalary       0.021886
Geography_Germany        False
Geography_Spain          False
Gender_Male              False
Name: 0, dtype: object
In [61]:
from sklearn.calibration import CalibratedClassifierCV
calibrated_rf = CalibratedClassifierCV(best_rf, method='sigmoid', cv=5)
calibrated_rf.fit(X_train_smote, y_train_smote)
calib_preds = calibrated_rf.predict(X_test)
print("Classification Report:\n", classification_report(y_test, calib_preds))
# Compute and display confusion matrix
cm = confusion_matrix(y_test, calib_preds)
disp = ConfusionMatrixDisplay(confusion_matrix=cm)
disp.plot(cmap='Blues')
plt.title("Confusion Matrix of Best Model (RandomFoest)")
plt.show()
Classification Report:
               precision    recall  f1-score   support

           0       0.91      0.91      0.91      2657
           1       0.62      0.61      0.62       643

    accuracy                           0.85      3300
   macro avg       0.77      0.76      0.76      3300
weighted avg       0.85      0.85      0.85      3300
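Whether the sigmoid calibration actually improved the probability estimates can be checked with a reliability curve — a sketch using sklearn.calibration.calibration_curve:
from sklearn.calibration import calibration_curve

prob_true, prob_pred = calibration_curve(y_test, calibrated_rf.predict_proba(X_test)[:, 1], n_bins=10)
plt.plot(prob_pred, prob_true, marker='o', label='Calibrated RF')
plt.plot([0, 1], [0, 1], 'k--', label='Perfectly calibrated')
plt.xlabel('Mean predicted probability')
plt.ylabel('Fraction of positives')
plt.title('Reliability Curve')
plt.legend()
plt.show()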
Saving the model
In [62]:
import pickle
with open('model.pkl', 'wb') as file:
    pickle.dump(calibrated_rf, file)
In [63]:
with open('scaler.pkl', 'wb') as f:
    pickle.dump(scaler, f)
In [64]:
with open('RF.pkl', 'wb') as file:
    pickle.dump(best_rf, file)
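For completeness, a hedged sketch of how a downstream script might load these artifacts back and score a row. Raw inputs would first need the same dummy encoding and the saved scaler applied to columns_to_scale; here an already-prepared row from X stands in:
with open('model.pkl', 'rb') as f:
    loaded_model = pickle.load(f)
with open('scaler.pkl', 'rb') as f:
    loaded_scaler = pickle.load(f)  # apply to raw numeric columns before predicting

sample = X.iloc[[0]]  # already encoded and scaled
print(loaded_model.predict_proba(sample)[:, 1])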
Data Preparation for Power BI
In [65]:
df1=pd.read_csv('Bank_Churn.csv')
In [66]:
df1.head()
Out[66]:
  | CustomerId | Surname | CreditScore | Geography | Gender | Age | Tenure | Balance | NumOfProducts | HasCrCard | IsActiveMember | EstimatedSalary | Exited |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 15634602 | Hargrave | 619 | France | Female | 42 | 2 | 0.00 | 1 | 1 | 1 | 101348.88 | 1 |
1 | 15647311 | Hill | 608 | Spain | Female | 41 | 1 | 83807.86 | 1 | 0 | 1 | 112542.58 | 0 |
2 | 15619304 | Onio | 502 | France | Female | 42 | 8 | 159660.80 | 3 | 1 | 0 | 113931.57 | 1 |
3 | 15701354 | Boni | 699 | France | Female | 39 | 1 | 0.00 | 2 | 0 | 0 | 93826.63 | 0 |
4 | 15737888 | Mitchell | 850 | Spain | Female | 43 | 2 | 125510.82 | 1 | 1 | 1 | 79084.10 | 0 |
In [67]:
# Predict with the calibrated model on the full feature matrix X
prob = calibrated_rf.predict_proba(X)[:, 1]
pred = calibrated_rf.predict(X)
# Make sure index of X matches df1
results = pd.DataFrame({
    'Exit_Probability': prob,
    'Predicted': pred
}, index=X.index)
# Join predictions to original dataset
df_with_predictions = df1.join(results, how='left')
In [68]:
df_with_predictions
Out[68]:
  | CustomerId | Surname | CreditScore | Geography | Gender | Age | Tenure | Balance | NumOfProducts | HasCrCard | IsActiveMember | EstimatedSalary | Exited | Exit_Probability | Predicted |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 15634602 | Hargrave | 619 | France | Female | 42 | 2 | 0.00 | 1 | 1 | 1 | 101348.88 | 1 | 0.339586 | 0 |
1 | 15647311 | Hill | 608 | Spain | Female | 41 | 1 | 83807.86 | 1 | 0 | 1 | 112542.58 | 0 | 0.155344 | 0 |
2 | 15619304 | Onio | 502 | France | Female | 42 | 8 | 159660.80 | 3 | 1 | 0 | 113931.57 | 1 | 0.982855 | 1 |
3 | 15701354 | Boni | 699 | France | Female | 39 | 1 | 0.00 | 2 | 0 | 0 | 93826.63 | 0 | 0.126286 | 0 |
4 | 15737888 | Mitchell | 850 | Spain | Female | 43 | 2 | 125510.82 | 1 | 1 | 1 | 79084.10 | 0 | 0.040259 | 0 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
9995 | 15606229 | Obijiaku | 771 | France | Male | 39 | 5 | 0.00 | 2 | 1 | 0 | 96270.64 | 0 | 0.013458 | 0 |
9996 | 15569892 | Johnstone | 516 | France | Male | 35 | 10 | 57369.61 | 1 | 1 | 1 | 101699.77 | 0 | 0.052480 | 0 |
9997 | 15584532 | Liu | 709 | France | Female | 36 | 7 | 0.00 | 1 | 0 | 1 | 42085.58 | 1 | 0.905769 | 1 |
9998 | 15682355 | Sabbatini | 772 | Germany | Male | 42 | 3 | 75075.31 | 2 | 1 | 0 | 92888.52 | 1 | 0.088806 | 0 |
9999 | 15628319 | Walker | 792 | France | Female | 28 | 4 | 130142.79 | 1 | 1 | 0 | 38190.78 | 0 | 0.022848 | 0 |
10000 rows × 15 columns
In [69]:
df_with_predictions['Predicted'].dtypes
Out[69]:
dtype('int64')
In [70]:
df_with_predictions[(df_with_predictions['Exited']==0) & (df_with_predictions['Predicted']==1)]
Out[70]:
  | CustomerId | Surname | CreditScore | Geography | Gender | Age | Tenure | Balance | NumOfProducts | HasCrCard | IsActiveMember | EstimatedSalary | Exited | Exit_Probability | Predicted |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
56 | 15630053 | Tsao | 656 | France | Male | 45 | 5 | 127864.40 | 1 | 1 | 0 | 87107.57 | 0 | 0.671391 | 1 |
59 | 15804771 | Velazquez | 614 | France | Male | 51 | 4 | 40685.92 | 1 | 1 | 1 | 46775.28 | 0 | 0.738268 | 1 |
79 | 15803136 | Postle | 416 | Germany | Female | 41 | 10 | 122189.66 | 2 | 1 | 0 | 98301.61 | 0 | 0.527592 | 1 |
107 | 15812878 | Parsons | 785 | Germany | Female | 36 | 2 | 99806.85 | 1 | 0 | 1 | 36976.52 | 0 | 0.575657 | 1 |
131 | 15718369 | Kaodilinakachukwu | 795 | Germany | Female | 33 | 9 | 130862.43 | 1 | 1 | 1 | 114935.21 | 0 | 0.926213 | 1 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
9766 | 15795511 | Vasiliev | 800 | Germany | Male | 39 | 4 | 95252.72 | 1 | 1 | 0 | 13906.34 | 0 | 0.519170 | 1 |
9780 | 15617432 | Folliero | 816 | Germany | Female | 40 | 9 | 109003.26 | 1 | 1 | 1 | 79580.56 | 0 | 0.762386 | 1 |
9783 | 15680430 | Ajuluchukwu | 601 | Germany | Female | 49 | 4 | 96252.98 | 2 | 1 | 0 | 104263.82 | 0 | 0.953823 | 1 |
9893 | 15598331 | Morgan | 764 | France | Female | 40 | 9 | 100480.53 | 1 | 1 | 0 | 124095.69 | 0 | 0.522813 | 1 |
9942 | 15683007 | Torode | 739 | Germany | Female | 25 | 5 | 113113.12 | 1 | 1 | 0 | 129181.27 | 0 | 0.896753 | 1 |
239 rows × 15 columns
In [71]:
df_with_predictions['Predicted'].value_counts()
Out[71]:
Predicted 0 7972 1 2028 Name: count, dtype: int64
In [72]:
df_with_predictions['Exited'].value_counts()
Out[72]:
Exited 0 7963 1 2037 Name: count, dtype: int64
In [73]:
(df_with_predictions['Exited']-df_with_predictions['Predicted']!=0).sum()
Out[73]:
np.int64(487)
In [74]:
df_with_predictions.to_csv('Bank_churn_modified.csv')