0% found this document useful (0 votes)
22 views26 pages

Loan Prediction-1

Uploaded by

amnwq
Copyright
© © All Rights Reserved
We take content rights seriously. If you suspect this is your content, claim it here.
Available Formats
Download as PDF or read online on Scribd
0% found this document useful (0 votes)
22 views26 pages

Loan Prediction-1

Uploaded by

amnwq
Copyright
© © All Rights Reserved
We take content rights seriously. If you suspect this is your content, claim it here.
Available Formats
Download as PDF or read online on Scribd
You are on page 1/ 26
423923, 12538 AM In [1]: amport pandas as pd Loan Prediction - Jupyter Notebook import matplotlib.pyplot as plt import seaborn as sns import nunpy as ap sns.set_theme(color_codes=True) In [2]: df = pd.read_csv('loan_train.csv') Gender Married Dependents Education Self_Employed Applicant_income Coapplicant_Income Loan df.head() out [2]: 0 Male No 1 Male Yes 2 Male Yes 3 Male Yes 4 Male No 0 1 ° Graduate Graduate Graduate Not Graduate Graduate No No Yes No. No Data Preprocessing Part 1 In [3]: #Check the number of unique value on object datatype df. select_dtypes (includ out[3]: Gender Married Dependents Education Self_Enployed Area Status dtype: intea object") nunique() Exploratory Data Analysis localhost 8890Inotebooks/Loan Prediction ayn ‘584900 458300 300000 258300 600000 00 150800.0, 09 236800. 00 1126 423923, 12538 AM In [4]: Loan Prediction -Jupyter Notebook # List of categorical variables to plot cat_vars = [‘Gender’, ‘Married’, ‘Dependents’, ‘Education’, ‘self Employed’, ‘Area’, ‘Credit History’, ‘Dependents’] # create figure with subplots Fig, axs = plt.subplots(nrows=2, ncols=4, figsize=(15, 10)) axs = axs.flatten() # create barplot for each categorical variable for i, var in enumerate(cat_vars): sns.countplot(x=var, hue='Status’, datasdf, ax=axs[i]) axs[i].set_xticklabels(axs[i].get_xticklabels(), rotation=90) # adjust spacing between subplots ig. tight_layout() show plot pit. show() z ie C Q i ar localhost 8890Inotebooks/Loan Prediction ayn 4123923, 12338 AM In [5]: locahhost 8890inotebooks/Loan Prediction ayn Loan Prediction -Jupyter Notebook import warnings warnings. filterwarnings(" ignore") # get List of categorical variables cat_vars = [‘Gender', ‘Married’, ‘Dependents’, ‘Education’, ‘self_Employed’, ‘Area’, ‘Credit History’, ‘Dependents'] # create figure with subplots fig, axs = plt.subplots(nrows=2, ncols=4, figsize=(15, 10)) axs = axs.flatten() # create histplot for each categorical variable for i, var in enumerate(cat_vars): sns.histplot(x-var, hue='Status', data-df, ax-axs[i], multiple="Fill” axs[i].set_xticklabels(df[var] .unique(), rotation=90) axs[i].set_xlabel (var) kde-False, ‘# adjust spacing between subplots fig. tight_layout() # show plot plt.show() ik | i =. I fi a 423923, 12538 AM Loan Prediction -Jupyter Notebook In [6]: num_vars = ['Applicant_Income’, ‘Coapplicant_Income', ‘Term'] Fig, axs = plt.subplots(nrows=1, ncols=3, figsize=(15, 7)) axs = axs.flatten() for i, var in enumerate(num_vars): sns.boxplot(x=var, dat: fig.tight_layout() plt.show() localhost 8890Inotebooks/Loan Prediction ayn 426 423923, 12538 AM Loan Prediction -Jupyter Notebook In [7]: num_vars = ['Applicant_Income’, ‘Coapplicant_Income', ‘Term'] Fig, axs = plt-subplots(nrows=1, ncols=3, figsize=(15, 7)) axs = axs.flatten() for i, var in enumerate(num_vars): sns.violinplot(x=var, data=df, a) fig. tight_layout() plt.show() localhost 8890Inotebooks/Loan Prediction ayn 423923, 12538 AM Loan Prediction -Jupyter Notebook In [8]: num_vars fig, axs = plt-subplots(nrows=1, ncols=3, figsiz axs = axs.flatten() as, 7) for i, var in enunerate(num_vars): sns.violinplot (x=var, y='Status', data-df, ax-axs[i]) fig. tight_layout() plt.show() iio aa localhost 8890Inotebooks/Loan Prediction ayn ['Applicant_tncome", "Coapplicant_Incone', ‘Term’ ] ) + + 626 423923, 12538 AM Loan Prediction -Jupyter Notebook In [9]: num_vars = ['Applicant_Income’, ‘Coapplicant_Income', ‘Term'] Fig, axs = plt-subplots(nrows=1, ncols=3, figsize=(15, 7)) axs = axs.flatten() for i, var in enumerate(num_vars): sns.histplot(x=var, data=df, ai fig.tight_layout() plt.show() 7 lu 7 = . — localhost 8890Inotebooks/Loan Prediction ayn 1128 423923, 12538 AM In [10]: num_vars fig, axs Loan Prediction -Jupyter Notebook [‘Applicant_Income’, ‘Coapplicant_Incone’, ‘Term’ ] plt. subplots (nrow: axs.flatten() for i, var in enunerate(num_vars): fig. tight_layout() plt.show() = sns.histplot (: Data Preprocessing Part 2 In [11]: df-head() » ncols=3, figsize=(15, 7)) ar, data=df, hue='Status', ax=axs[i]) oss): Gender Married Dependents Education Self_Employed Applicant Income Coapplicant_Income Loan, 0 Wale We 0 Grote We seeac0 00 1 Mae Yes 1 Grote we asss00 ss00000 2 Mle ves 0 Graduate ves sooneo 00 3 Mle ves © Gat W 2sss00 zsst000 4 tle Wo 0 Grodute " sooito 00 localhost 8890Inotebooks/Loan Prediction ayn 8126 423923, 12538 AM In [12]: out [12]: In [13]: out (13): In [14]: out (14): In [15]: Loan Prediction - Jupyter Notebook heck the missing value check missing = df.isnul1().sum() * 100 / df.shape[o] check_missing[check_missing > 0].sort_values(ascending-False) Credit History 8.143322 Self_Employed 5.211726 Dependents 2.442997 Term 2.280130 Gender 2.117264 Married 0.488599 dtype: floatea # FILL null values with ‘Unknown* df.fillna( ‘Unknown’, inplace=True) #check the missing value again check_missing = df.isnull().sum() * 100 / df.shape[2] check_missing|check_missing > @].sort_values(ascending=False) Series({], dtyp float6a) df.dtypes Gender object Married object Dependents object Education object Self _Enployed object Applicant_Incone intes Coapplicant_Income floate4 Loan_Anount intes Term object Credit History object Area object Status object dtype: object Label Encoding for Object datatype # Loop over each column in the DataFrane where dtype is ‘object’ for col in df.select_dtypes(include=[ ‘object’ ]).columns: # Print the column name and the unique values print(#"{col}: {df[col] .unique()}") Gender: ['Male' 'Female' ‘Unknown" ] Married: ['No' "Yes" 'Unknown'] Dependents: ['@" "1" '2' "3+" "Unknown" } Education: ['Graduate’ 'Not Graduate’ ] Self_Employed: ['No’ "Yes" ‘Unknown’ ] Term: [360.0 120.0 240.2 ‘Unknown’ 180.0 60.0 300.0 480.0 36.0 84.0 12.0] Credit History: [1.@ @.@ "Unknown" ] Area: ['Urban' ‘Rural’ ‘Semiurban" ] Status: ['¥" 'N'] localhost 8890Inotebooks/Loan Prediction ayn 926 423923, 12538 AM In [16]: In [17]: Loan Prediction - Jupyter Notebook # Convert selected columns to string data type AF[[ Term’, ‘Credit History']] = df[[ ‘Term’, ‘Credit History’ ]].astype(str) from sklearn import preprocessing # Loop over each column in the DataFrame where dtype is ‘object’ for col in df.select_dtypes(include=[ ‘object’ ]).columns: # Initialize a LabelEncoder object label_encoder = preprocessing. LabelEncoder() 4 Fit the encoder to the unique values in the column label_encoder .fit (dF [col] .unique()) 4 Transform the column using the encoder df [col] = label_encoder.transform(df[col]) # Print the column name and the unique encoded values print(#"{col}: {df[col] .unique()}") Gender: [1 @ 2] Married: [@ 2 1] Dependents: [@ 12 3 4] Education: [6 1] Self_Employed: [0 2 1] Terms [6 1310 28475 9 @] Credit History: [1 @ 2] Area: [2 @ 1] Status: [1 0] Check if the Label 'Status' is balanced or not localhost 8890Inotebooks/Loan Prediction ayn 10126 423923, 12538 AM Loan Prediction -Jupyter Notebook In [18]: sns.countplot (df[ status" ]) df[ ‘Status’ ].value_counts() out{1s}: 1 422 @ 192 Name: Status, dtype: intes Status Oversampling Minority Class to balance the Label In [19]: from sklearn.utils import resample create tno different dataframe of majority and minority class df_majority = df{(df[ ‘status’ df_minority = df{(dF[ ‘Status’ # upsample minority class df_minority_upsampled = resample(df_minority, replace=true, # sample with replacement ALsamples= 422, # to match majority class randon_state=0) # reproducible results # Combine majority class with upsampled minority class df_upsanpled = pd.concat([d#_minority_upsampled, df majority]) localhost 8890Inotebooks/Loan Prediction ayn 26 423923, 12538 AM Loan Prediction -Jupyter Notebook In [20]: sns.countplot (4f_upsampled| ‘status’ ]) df_upsanpled|[ ‘status’ ].value_counts() out[2e}: @ 422 1 422 Name: Status, dtype: intes 400 350 ‘count 150 100 Status Remove Outi r using IQR because there are alot of extreme value In [21]: df_upsampled. shape out(21]: (a4, 12) localhost 8890Inotebooks/Loan Prediction ayn 1226 423923, 12538 AM In [22]: # specify the columns to remove outliers from dataframe colunn_names = ['Applicant_Incone’ , “coapplicant_Incone’ Loan Prediction - Jupyter Notebook "Term # remove outliers for each selected column using the IQR method for colunn_nane in column_nanes: d¥_upsampled[ colurn_nane] .quantile(@.25) df_upsanpled[ column_nane] .quantile(9.75) a= @ IQR df_upsanpled.head() G-a 4f_upsanpled = df_upsanpled[~((df_upsampled[colunn_name] < (Qt - 1.5 * TOR) | ( Married Dependents Education Self_Employed Applicant_Incomo Coapplicant_Income Lo out [22]: Gender 148 0 338 ° os 1 57 1 107 1 0 o 2 0 a 1 0 1 ° In [23]: #Check the shape after outlier removal. df_upsanpled. shape out[23]: (614, 12) localhost 8890Inotebooks/Loan Prediction ayn 0 ° 1 ° 1000000 182000 371700 336600 1733300 +166600.0 o. 292500.0 220000.0 °. 1926 423923, 12538 AM Loan Prediction -Jupyter Notebook In [24]: plt.figure(Figsize=(15,12)) sns.heatmap(df_upsampled.corr(), fmt: annot=True) out [24]: In [25]: df_upsampled.drop(columns='Term’, inplace=True) Train Test Split 4f_upsampled.drop( ‘Status’, axis=1) ROC Curve 10 08 & True Positive Rate g 02 00 a0 02 a4 06 a8 1.0 False Positive Rate Random Forest 20128 423923, 12538 AM In [39]: In [40]: out [40]: In [41]: In [42]: Loan Prediction - Jupyter Notebook from sklearn.ensemble import RandonForestClassifier from sklearn.model_selection import GridSearchcV rfc = RandonForestClassifier() param_grid = { ‘nestimators': [100, 200], *max_depth': [None, 5, 10], ‘max_features': ["sqrt’, ‘log2', None] } # Perform a grid search with cross-validation to find the best hyperparaneters grid_search = GridSearchCv(rfc, paran_grid, cv=5) grid_search.fit(x_train, y_train) # Print the best hyperparameters print (grid_search.best_parans_) {’max_depth': None, 'max_features': 'log2', 'n_estimators': 200) from sklearn.ensenble import RandonForestClassifier rfc = RandomForestClassifier(random_state=0, max_features="log2", n_estimators=200) rfc. fit(X_train, y_train) RandonForestClassifier(max_feature: 1og2", n_estinators=200, random_stat« y_pred = rfc.predict(X_test) print("Accuracy Score :", round(accuracy_score(y_test, y_pred)*100 ,2), "%") Accuracy Score : 95.12 % from sklearn.metrics inport accuracy_score, f1_score, precision_score, recall_score, print(‘F-1 Score : ',(f1_score(y test, y_pred, average='micro'))) print('Precision Score : ',(precision score(y test, y_pred, average='micro’))) print('Recall Score : ',(recall_score(y test, y_pred, average='micro'))) print(‘Jaccard Score : ',(Jaccard_score(y_test, y_pred, average="micro'))) print(*Log Loss : ', (log loss(y_test, y_pred))) F-1 Score : 0.9512195121951219 Precision Score : @.9512195121951219 Recall Score : @.9512195121951219 Jaccard Score : @.9069767441860465 Log Loss : 1.6848443638958128, localhost 8890Inotebooks/Loan Prediction ayn 21128 423923, 12538 AM In [43]: localhost 8890Inotebooks/Loan Prediction ayn Loan Prediction -Jupyter Notebook imp, pd.DataFrame({ ‘eature Name": X_train. columns, "Importance": rfc. feature_importances_ » Fi = imp_df.sort_values(by="Inportance", ascending=False) fi2 = fi-head(10) plt. figure(figsize=(10,8)) sns. barplot (dat: ‘Importance’, y='Feature Name’) plt.title('Top 10 Feature Importance Each Attributes (Random Forest)’, fontsize=18) plt.xlabel ("Inportance’, fontsize=16) plt.ylabel (‘Feature Name’, fontsize=16) plt. show() Top 10 Feature Importance Each Attributes (Random Forest) Peplicant income ret History Loon_Amount (coappticant income Dependents ea Feature Name Sell Employed Eveaton 8 010 ons Importance 2226 423923, 12538 AM Loan Prediction -Jupyter Notebook In [44]: import shap # compute SHAP values explainer = shap.TreeExplainer(rfc) shap_values = explainer.shap_values(X_test) shap.sunmary plot(shap_values[1], X_test.values, feature_names = X_test.colunns) High Credit History + smeeme e+ Applicant_Income . Loan_Amount tat . Area teehee ° Coapplicant_Income = batterie = 3 Dependents . . 2 Gender ste . Married eh: Self_Employed cote = Education “ 1 1 Pow 46 45 04 03 42 01 00 of 02 SHAP value (impact on model output) localhost 8890Inotebooks/Loan Prediction ayn 2226 423923, 12538 AM Loan Prediction -Jupyter Notebook In [45]: import shap explainer = shap.TreeExplainer(rfc) shap_values = explainer.shap_values(X_test) shap.sunmary_plot(shap_values, X_test) Applicant_Income Loan_Amount Area Coapplicant_income Dependents Gender Married Self_Employed | mmm Class 1 mms Class 0 Education 000 «0050005280 mean(/SHAP value|) (average impact on model output magnitude) localhost 8890Inotebooks/Loan Prediction ayn 24126 4123723, 12:38 AM Loan Presi -Jpyter Notebook In [46]: from sklearn.metrics import confusion matrix cm = confusion matrix(y test, y_pred) plt.figure(figsize=(5,5)) sns-heatmap (data=cm, Linewidths= pit.ylabel(‘Actual label’) plt.xlabel( ‘Predicted label’) all_sanple_title = ‘Accuracy Score for Randon Forest: {@}'.format(rfe.score(X test, y, plt.title(all_sample title, size = 15) 5, annot-True, cmap = ‘Blues') Out[46]: Text(@.5, 1.8, ‘Accuracy Score for Random Forest: @.9512195121951219") Accuracy Score for Random Forest: 0.9512195121951219 60 ° a 4 = 40 30 -20 Actual labo! -10 Predicted label localhost 8890Inotebooks/Loan Prediction ayn 25126 423923, 12538 AM In [47]: out (47): localhost 8890Inotebooks/Loan Prediction ayn Loan Prediction - Jupyter Notebook from sklearn.metrics import roc_curve, roc_auc_score y_pred_proba = rfc.predict_proba(X_test)[:][:, df_actual_predicted = pd.concat([pd.DataFrame(np.array(y_test), colunns=["y_actual']) df_actual_predicted.index = y_test. index for, tpr, tr = roc_curve(df_actual_predicted['y_actual'], df_actual_predictedt 'y_pred auc’ = ro¢_auc_score(df_actual_predicted["y actual], df actual_predicted[ "y pred prob, plt.plot(fpr, tpr, label='AUC = %0.4F" Xauc) pit.plot(#pr, fpr, linestyle = '--', color plt.xlabel(‘False Positive Rate’) plt.ylabel(‘True Positive Rate’) plt.title('ROC Curve’, size = 15) plt.legend() kK) ROC Curve 10 08 & True Positive Rate g 02 oo — Avc=09871 or) 02 a4 06 a8 1.0 False Positive Rate 26128

You might also like