Credit - Defaulters - Prediction Using Logistic Regression
Credit - Defaulters - Prediction Using Logistic Regression
#Importing Libraries
[5]: import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import warnings
warnings.filterwarnings('ignore')
#Load Dataset
[7]: df = pd.read_csv('german_credit_data.csv')  # raw credit records; path is relative to the working directory
[8]: Unnamed: 0 Age Sex Job Housing Saving accounts Checking account \
0 0 67 male 2 own NaN little
1 1 22 female 2 own little moderate
2 2 49 male 1 own little NaN
3 3 45 male 2 free little little
4 4 53 male 2 free little little
5 5 35 male 1 free NaN NaN
6 6 53 male 2 own quite rich NaN
7 7 35 male 3 rent little moderate
8 8 61 male 1 own rich NaN
9 9 28 male 3 own little moderate
1
0 1169 6 radio/TV
1 5951 48 radio/TV
2 2096 12 education
3 7882 42 furniture/equipment
4 4870 24 car
5 9055 36 education
6 2835 24 furniture/equipment
7 6948 36 car
8 3059 12 radio/TV
9 5234 30 car
[9]: df.shape  # (rows, columns) of the raw frame
[10]: df.info()  # per-column non-null counts and dtypes
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 10 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 Unnamed: 0 1000 non-null int64
1 Age 1000 non-null int64
2 Sex 1000 non-null object
3 Job 1000 non-null int64
4 Housing 1000 non-null object
5 Saving accounts 817 non-null object
6 Checking account 606 non-null object
7 Credit amount 1000 non-null int64
8 Duration 1000 non-null int64
9 Purpose 1000 non-null object
dtypes: int64(5), object(5)
memory usage: 78.2+ KB
[11]: df.describe(include='all')  # summary statistics for numeric and non-numeric columns alike
2
75% 749.250000 42.000000 NaN 2.000000 NaN NaN
max 999.000000 75.000000 NaN 3.000000 NaN NaN
#EDA
[12]: # Draw one histogram per numeric column of the raw frame on a 20x20 inch figure.
df.hist(figsize=(20, 20))
plt.show()
3
[13]: df = df.loc[:, ~df.columns.str.contains('^Unnamed')]
[14]: df.to_csv('german_credit_data.csv')
18424
4
else:
return 0 # Not creditworthy
[17]: df1=pd.read_csv('updated_file.csv')
df.head(2)
[17]: Age Sex Job Housing Saving accounts Checking account Credit amount \
0 67 male 2 own NaN little 1169
1 22 female 2 own little moderate 5951
5
[19]: plt.figure(figsize=(20,10))
sns.countplot(x='Duration', hue='Creditability', data=df1)
plt.xlabel('Duration')
plt.ylabel('total no of credit defaulters')
plt.show()
6
[20]: gender_df=df1.groupby(["Sex", "Creditability"])["Purpose"].value_counts()
gender_df
7
business 56
education 26
repairs 13
domestic appliances 6
vacation/others 3
Name: count, dtype: int64
[21]: plt.figure(figsize=(20,10))
ax=sns.countplot(x='Sex', hue='Job', data=df1)
plt.legend(title='Job',loc='upper right')
plt.show()
8
9
[23]: plt.figure(figsize=(30,10))
plt.ylabel("total no of credit defaulters")
dx=sns.countplot(x='Purpose', hue='Creditability', data=df1)
plt.show()
10
1: Good customers, 0: Bad customers
[24]: plt.figure(figsize=(15,8))
plt.ylabel("total no of credit defaulters")
dx=sns.countplot(x='Creditability', hue='Housing', data=df1)
plt.show()
Correlation
[25]: corr = df1[['Creditability', 'Credit amount']].corr()
# Create a heatmap
plt.figure(figsize=(8, 6))
sns.heatmap(corr, annot=True, cmap='coolwarm', fmt='.2f', square=True)
plt.title('Correlation Heatmap')
plt.show()
11
Converting categorical values into numerical values
[26]: df1['Sex'] = df1['Sex'].replace({'male': 1, 'female': 0})
12
# Encode the 'Checking account' categories as integer codes; values not listed in
# the mapping (e.g. missing values, if any remain) are left unchanged by replace().
# NOTE(review): 'quite rich' (3) is coded above 'rich' (2) — confirm this is the intended ordinal order.
df1['Checking account'] = df1['Checking account'].replace({'little': 0,␣
↪'moderate': 1, 'rich': 2, 'quite rich': 3})
[32]: df1.drop(['Purpose'],axis=1,inplace=True)
#preparation of datasets
[33]: Predictor = df1[df1.columns[df1.columns != 'Creditability']] # All columns␣
↪except 'Creditability'
Target = df1['Creditability']
X train (700, 8)
Y train (700,)
X test (300, 8)
y test (300,)
#Model Training
[36]: from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix,␣
↪classification_report
[37]: logr=LogisticRegression()
logr.fit(X_train,y_train)
y_Pred=logr.predict(X_test)
[38]: y_Pred  # predicted class labels for X_test
[38]: array([0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 0, 1, 1,
1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 0, 1, 0, 1, 1, 0, 1, 1, 0,
1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 0, 0, 1,
1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1,
13
0, 1, 1, 0, 1, 1, 1, 1, 0, 1, 0, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1,
1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 0, 1, 1, 0, 1,
1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1,
1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1,
1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1])
[39]: # Confusion matrix of true vs. predicted test labels. scikit-learn puts true labels
# on rows and predicted labels on columns, so for labels {0, 1} the layout is
# [[TN, FP], [FN, TP]].
conf_matrix=confusion_matrix(y_test,y_Pred)
conf_matrix
#True Positives (TP): This value should be high. It indicates the number of␣
↪actual positives that were correctly identified by the model.
#True Negatives (TN): This value should also be high. It indicates the number␣
↪of actual negatives that were correctly identified.
#False Positives (FP): This value should be low. It indicates the number of␣
↪actual negatives that were incorrectly classified as positives.
#False Negatives (FN): This value should be low. It indicates the number of␣
↪actual positives that were incorrectly classified as negatives.
14
Accuracy = (TP+TN)/(TP+TN+FP+FN)
Precision = (TP)/(TP+FP)
Recall = (TP)/(TP+FN)
F1 score = 2*(Precision*Recall)/(Precision+Recall)
[42]: print('Accuracy',accuracy_score(y_test,y_Pred))
print('Precision',logr.score(X_test,y_test))
print('Recall',logr.score(X_test,y_test))
print('F1 score',logr.score(X_test,y_test))
Accuracy 0.9533333333333334
Precision 0.9533333333333334
Recall 0.9533333333333334
F1 score 0.9533333333333334
15
[43]: print(classification_report(y_test,y_Pred))  # per-class precision, recall, F1 and support
[48]: # Column 1 of predict_proba is the probability of the second entry in logr.classes_
# (presumably class 1 — confirm).
# NOTE(review): `metrics` is not imported in any visible cell; assumed to be
# sklearn.metrics imported elsewhere — TODO confirm.
y_pred_proba=logr.predict_proba(X_test)[:,1]
# ROC curve points (the thresholds are discarded) and the area under that curve.
fpr, tpr, _ = metrics.roc_curve(y_test, y_pred_proba)
auc = metrics.roc_auc_score(y_test, y_pred_proba)
16
[53]: from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
0.9528447232309892
The model has an accuracy of about 95%.
[58]: df1.to_csv('preprocessed_data.csv', index=False)  # persist the encoded frame without the row index
17