2_Data_Analysis.ipynb - Colaboratory
2_Data_Analysis.ipynb - Colaboratory
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from ydata_profiling import ProfileReport
%matplotlib inline
import warnings
warnings.filterwarnings('ignore')
Mounted at /content/drive
'OpenCreditLines',
'TotalCreditLinespast7years',
'OpenRevolvingAccounts',
'OpenRevolvingMonthlyPayment',
'InquiriesLast6Months',
'TotalInquiries',
'CurrentDelinquencies',
'AmountDelinquent',
'DelinquenciesLast7Years',
'PublicRecordsLast10Years',
'PublicRecordsLast12Months',
'RevolvingCreditBalance',
'BankcardUtilization',
'AvailableBankcardCredit',
'TotalTrades',
'TradesNeverDelinquent (percentage)',
'TradesOpenedLast6Months',
'DebtToIncomeRatio',
'IncomeRange',
'IncomeVerifiable',
'StatedMonthlyIncome',
'LoanKey',
'TotalProsperLoans',
'TotalProsperPaymentsBilled',
'OnTimeProsperPayments',
'ProsperPaymentsLessThanOneMonthLate',
'ProsperPaymentsOneMonthPlusLate',
'ProsperPrincipalBorrowed',
'ProsperPrincipalOutstanding',
'ScorexChangeAtTimeOfListing',
'LoanCurrentDaysDelinquent',
'LoanFirstDefaultedCycleNumber',
'LoanMonthsSinceOrigination',
'LoanNumber',
'LoanOriginalAmount',
'LoanOriginationDate',
'LoanOriginationQuarter',
'MemberKey',
'MonthlyLoanPayment',
'LP_CustomerPayments',
'LP_CustomerPrincipalPayments',
'LP_InterestandFees',
'LP_ServiceFees',
'LP_CollectionFees',
'LP_GrossPrincipalLoss',
'LP_NetPrincipalLoss',
'LP_NonPrincipalRecoverypayments',
'PercentFunded',
'Recommendations',
'InvestmentFromFriendsCount',
'InvestmentFromFriendsAmount',
'Investors',
'LoanOriginationYear',
'LoanMonthsSinceOriginationY',
'LoanFirstDefaultedCycleNumberQ',
'bad_aux',
'population',
'bad']
on the one hand, information/characteristics of the application or the applicant at the time of the application
on the other hand, information on their subsequent behaviour
The first one is used to model the target and the second one, to define the target.
others that talk about the behavior of the loan (post-grant), so we can not use them for modeling purposes
and we also observe identifiers, that in no case are characteristics of the applicant/application
keyboard_arrow_down Drop columns that cannot be used for modeling purposes
#Columns excluded from modeling. The reasons fall into a few families:
#  - leakage: only known after origination/listing (post-grant behaviour)
#  - pricing: set after the risk assessment, so unavailable at application time
#  - model output: generated by Prosper's own scoring models
#  - identifiers and raw dates: not characteristics of the applicant/application
drop = ['CreditGrade', #data not filled post 2009
'BorrowerRate', #RATE: price is set after risk assessment
'LenderYield', #yield: interest rate less servicing fee
'EstimatedEffectiveYield', #Prosper model generated data
'EstimatedLoss', #Prosper model generated data
'EstimatedReturn', #Prosper model generated data
'ProsperRating (numeric)', #Prosper model generated data
'CurrentlyInGroup', #post origination date
'GroupKey', #ID
'DateCreditPulled', #drop - date when the credit score was pulled
'FirstRecordedCreditLine', #would need a date-to-numeric transformation first
'LoanKey', #ID
'LoanCurrentDaysDelinquent', #part of target definition, post origination date
'LoanFirstDefaultedCycleNumber', #part of target definition, post origination date
'LoanMonthsSinceOrigination', #part of target definition, post origination date
'LoanNumber', #ID
'LoanOriginationDate', #raw date
'LoanOriginationQuarter', #REVIEW
'MemberKey', #ID
'LP_CustomerPayments', #post origination date
'LP_CustomerPrincipalPayments', #post origination date
'LP_InterestandFees', #post origination date
'LP_ServiceFees', #post origination date
'LP_CollectionFees', #post origination date
'LP_GrossPrincipalLoss', #post origination date
'LP_NetPrincipalLoss', #post origination date
'LP_NonPrincipalRecoverypayments',#post origination date
'ListingKey', #ID
'ListingNumber', #ID
'ListingCreationDate', #creation date
'LoanStatus', #post origination date
'ClosedDate', #post origination date
'BorrowerAPR', #price is set after risk assessment
'ProsperRating (Alpha)', #price is set after risk assessment
'ProsperScore', #Prosper model generated data
'CurrentDelinquencies', #post origination date
'AmountDelinquent', #post origination date
'InvestmentFromFriendsCount', #post listing date
'InvestmentFromFriendsAmount', #post listing date
'Investors', #post listing date
'PW', #PW flag - drop
'fraud', #fraud flag - drop
'bad', #bad flag (the target itself) - drop
'indeterm', #indeterminate flag - drop
'LoanOriginationYear', #origination date - drop
'LoanMonthsSinceOriginationY', #post origination date
'LoanFirstDefaultedCycleNumberQ', #post origination date
'bad_aux', #auxiliary bad flag - drop
'population' #population flag - drop
]
features
['Term',
'ListingCategory (numeric)',
'BorrowerState',
'Occupation',
'EmploymentStatus',
'EmploymentStatusDuration',
'IsBorrowerHomeowner',
'CreditScoreRangeLower',
'CreditScoreRangeUpper',
'CurrentCreditLines',
'OpenCreditLines',
'TotalCreditLinespast7years',
'OpenRevolvingAccounts',
'OpenRevolvingMonthlyPayment',
'InquiriesLast6Months',
'TotalInquiries',
'DelinquenciesLast7Years',
'PublicRecordsLast10Years',
'PublicRecordsLast12Months',
'RevolvingCreditBalance',
'BankcardUtilization',
'AvailableBankcardCredit',
'TotalTrades',
'TradesNeverDelinquent (percentage)',
'TradesOpenedLast6Months',
'DebtToIncomeRatio',
'IncomeRange',
'IncomeVerifiable',
'StatedMonthlyIncome',
'TotalProsperLoans',
'TotalProsperPaymentsBilled',
'OnTimeProsperPayments',
'ProsperPaymentsLessThanOneMonthLate',
'ProsperPaymentsOneMonthPlusLate',
'ProsperPrincipalBorrowed',
'ProsperPrincipalOutstanding',
'ScorexChangeAtTimeOfListing',
'LoanOriginalAmount',
'MonthlyLoanPayment',
'PercentFunded',
'Recommendations']
keyboard_arrow_down Let's explore first the type and meaning of the variables
Examples:
EmploymentStatusDuration -> How long the employee has been employed (float64)
BankcardUtilization -> The % of available revolving credit that is utilized at the time of application (float64)
DebtToIncomeRatio -> The debt to income ratio of the borrower at the time of application (float64)
BorrowerState -> The two letter abbreviation of the state of the address of the borrower at the time of application (object)
IncomeRange -> The income range of the borrower at the time of application (object)
print(df[features].dtypes)
Term int64
ListingCategory (numeric) int64
BorrowerState object
Occupation object
EmploymentStatus object
EmploymentStatusDuration float64
IsBorrowerHomeowner bool
CreditScoreRangeLower int64
CreditScoreRangeUpper int64
CurrentCreditLines int64
OpenCreditLines int64
TotalCreditLinespast7years int64
OpenRevolvingAccounts int64
OpenRevolvingMonthlyPayment int64
InquiriesLast6Months int64
TotalInquiries int64
DelinquenciesLast7Years int64
PublicRecordsLast10Years int64
PublicRecordsLast12Months int64
RevolvingCreditBalance int64
BankcardUtilization float64
AvailableBankcardCredit int64
TotalTrades int64
TradesNeverDelinquent (percentage) float64
TradesOpenedLast6Months int64
DebtToIncomeRatio float64
IncomeRange object
IncomeVerifiable bool
StatedMonthlyIncome float64
TotalProsperLoans float64
TotalProsperPaymentsBilled float64
OnTimeProsperPayments float64
ProsperPaymentsLessThanOneMonthLate float64
ProsperPaymentsOneMonthPlusLate float64
ProsperPrincipalBorrowed float64
ProsperPrincipalOutstanding float64
ScorexChangeAtTimeOfListing float64
LoanOriginalAmount int64
MonthlyLoanPayment float64
PercentFunded float64
Recommendations int64
dtype: object
df[df['BankcardUtilization']>0.0]['BankcardUtilization'].head(5)
0 0.49
1 0.35
2 0.43
3 0.07
4 0.21
Name: BankcardUtilization, dtype: float64
df[df['TotalProsperLoans']>0.0]['TotalProsperLoans'].head(5)
3 1.0
10 1.0
12 1.0
15 1.0
17 2.0
Name: TotalProsperLoans, dtype: float64
df[num_features].describe().transpose()
count mean std min
df[cat_features].describe().transpose()
features_report = ['Occupation','OpenCreditLines','CurrentCreditLines','DebtToIncomeRatio','TotalProsperLoans','RevolvingCre
profile = ProfileReport(df[features_report])
profile.to_file("output.html")
profile
Overview
Dataset statistics
Number of variables 6
Duplicate rows 64
Variable types
Text 1
Numeric 5
Alerts
Dataset has 64 (0.3%) duplicate rows Duplicates
variables with too much granularity (example: Occupation) -> we could think of grouping categories according to the observed BR, without
losing sight of the business meaning
correlations between variables -> it depends on the modeling methodology we are going to use, we should be concerned about this issue
to a greater or lesser extent
variables with a high percentage of missing/zero values -> it is important to ask whether it makes sense to record such a percentage of
missing/zeros, and the treatment we want to make of them
EXERCISE!!!
#split the population on whether DebtToIncomeRatio is informed or missing
DTI_informed = df.loc[df.DebtToIncomeRatio.notna()]
DTI_missing = df.loc[df.DebtToIncomeRatio.isna()]
Large differences can be observed in terms of BR between the population with informed DTI and the population with missing DTI
Does DTI being missing mean that other basic information is also missing?
DTI_missing[['DebtToIncomeRatio','StatedMonthlyIncome','LoanOriginalAmount','MonthlyLoanPayment']].head(5)
(0, 87)
Given the differences in terms of BR, and given that it does not imply that other basic variables are missing, we keep these clients within the
population.
So, when we talk about the discriminatory power of a variable we are talking about its capacity to give information about the characteristics of
the bad payer, in contrast to those of the good payer.
These functions handle a basic bucketing schema. For numeric features it keeps the data between the "input_slider" percentiles, and splits the
data in "n_bins". For categorical data, it keeps top "n_bins" categories.
#visualization functions
def capture_df(feat_col, input_slider, n_bins, df, target_col):
    """
    Handles the type of the data to generate the intermediate dataframe.

    Routes numeric features to the quantile-bucketing helper and everything
    else (strings, booleans, ...) to the categorical one.

    Args:
        feat_col: Name of the feature column under study
        input_slider: [low, high] percentile bounds kept for numeric data
        n_bins: Number of buckets (numeric) / top categories (categorical)
        df: Pandas DataFrame with the input data
        target_col: Name of the binary target column

    Returns:
        Whatever df_vol_br_num / df_vol_br_cat returns for this feature
    """
    col = df[feat_col]
    #Membership test against [int, float, np.number] only matched the
    #platform-default 64-bit dtypes; is_numeric_dtype also covers
    #int32/float32/etc. Booleans count as numeric for pandas, so they are
    #explicitly kept on the categorical path (the original routing).
    if pd.api.types.is_numeric_dtype(col) and not pd.api.types.is_bool_dtype(col):
        return df_vol_br_num(feat_col, input_slider, n_bins, df, target_col)
    else:
        return df_vol_br_cat(feat_col, input_slider, n_bins, df, target_col)
)
return {'data': [vol_bars, tr_line, avg_line],
'layout': layout}
A variable with high predictive power is a variable that is capable of giving information about the bad payer, in contrast to the good payer. In
other words, it is capable of separating the two behaviors. In the case of a categorical variable -> a variable with high predictive power is a
variable for which certain categories concentrate a high volume of bad payers and others, a low volume.
cuts N BR
In the case of a numerical variable -> a variable with high predictive power is a variable for which certain ranges concentrate a high volume of
bad payers and others, a low volume.
0 NA 2249 0.171187
Args:
df_vbr: Pandas DataFrame, containing volume counts and bad counts
col_target: Name of the target column
Returns:
Estimated IV
"""
#calculate the IV
#bin-wise good/bad count
N_bad_bin = ((df_vbr[col_target]) * df_vbr.N).round()
N_good_bin = df_vbr.N - N_bad_bin
#total good-bads
N_bad = N_bad_bin.sum()
N_good = N_good_bin.sum()
#binwise dist of good-bads
dist_goods = N_good_bin / N_good
dist_bads = N_bad_bin / N_bad
#Binwise Woe
woe = np.log(dist_goods / (dist_bads + 0.00000001))
#Binwise IV
iv = (dist_goods - dist_bads) * woe
return iv
#Information Value of every candidate feature, using a 10-bucket binning
#clipped at the 97.5th percentile
ivs = []
for feature_name in features:
    binned, _avg_br = capture_df(feature_name, [0., 97.5], 10, df, 'bad')
    ivs.append(get_iv(binned, col_target='BR').sum())
#rank the features from most to least predictive
df_iv = pd.DataFrame({'feature': features, 'IV': ivs})
df_iv = df_iv.sort_values(by='IV', ascending=False)
df_iv
bad Term Term_
0 False 36 (28.0, 44.0]
1 False 36 (28.0, 44.0]
2 False 36 (28.0, 44.0]
3 False 36 (28.0, 44.0]
4 False 36 (28.0, 44.0]
... ... ... ...
20700 False 36 (28.0, 44.0]
20701 False 60 (44.0, 60.0]
20702 False 36 (28.0, 44.0]
20703 False 36 (28.0, 44.0]
20704 False 12 (11.999, 28.0]
TradesNeverDelinquent (percentage)_
0 (0.632, 0.724]
1 (0.908, 1.0]
2 (0.908, 1.0]
3 (0.908, 1.0]
4 (0.908, 1.0]
... ...
20700 (0.908, 1.0]
20701 (0.816, 0.908]
20702 (0.908, 1.0]
20703 (0.724, 0.816]
20704 (0.632, 0.724]
ProsperPaymentsLessThanOneMonthLate_
3 (-0.001, 0.889]
10 (-0.001, 0.889]
12 (-0.001, 0.889]
15 (-0.001, 0.889]
17 (-0.001, 0.889]
... ...
20682 (-0.001, 0.889]
20686 (-0.001, 0.889]
20687 (-0.001, 0.889]
20690 (-0.001, 0.889]
20697 (-0.001, 0.889]
28 StatedMonthlyIncome 0.148338
26 IncomeRange 0.142623
25 DebtToIncomeRatio 0.109896
7 CreditScoreRangeLower 0.084819
8 CreditScoreRangeUpper 0.084819
36 ScorexChangeAtTimeOfListing 0.077857
3 Occupation 0.075511
31 OnTimeProsperPayments 0.073233
4 EmploymentStatus 0.064521
30 TotalProsperPaymentsBilled 0.060063
38 MonthlyLoanPayment 0.059123
14 InquiriesLast6Months 0.059109
37 LoanOriginalAmount 0.055845
10 OpenCreditLines 0.055633
9 CurrentCreditLines 0.050520
12 OpenRevolvingAccounts 0.050037
35 ProsperPrincipalOutstanding 0.049404
27 IncomeVerifiable 0.048851
22 TotalTrades 0.046902
32 ProsperPaymentsLessThanOneMonthLate 0.041367
11 TotalCreditLinespast7years 0.033806
24 TradesOpenedLast6Months 0.033370
2 BorrowerState 0.030815
34 ProsperPrincipalBorrowed 0.027194
20 BankcardUtilization 0.026802
13 OpenRevolvingMonthlyPayment 0.026497
29 TotalProsperLoans 0.023694
6 IsBorrowerHomeowner 0.022738
21 AvailableBankcardCredit 0.019976
33 ProsperPaymentsOneMonthPlusLate 0.017298
5 EmploymentStatusDuration 0.016949
15 TotalInquiries 0.013414
0 Term 0.012167
19 RevolvingCreditBalance 0.009131
39 PercentFunded 0.005642
17 PublicRecordsLast10Years 0.002602
16 DelinquenciesLast7Years 0.002380
18 PublicRecordsLast12Months 0.000000
40 Recommendations 0.000000
28 StatedMonthlyIncome 0.148338
26 IncomeRange 0.142623
17 PublicRecordsLast10Years 0.002602
16 DelinquenciesLast7Years 0.002380
#render the volume/BR chart of every selected feature
for feature_name in df_iv_loc['feature'].tolist():
    py.iplot(output_graph_update(feature_name, [0., 97.5], 6, df, 'bad'))
bad StatedMonthlyIncome StatedMonthlyIncome_
0 False 5916.666667 (5277.778, 7916.667]
1 False 4166.666667 (2638.889, 5277.778]
2 False 7083.333333 (5277.778, 7916.667]
3 False 3166.666667 (2638.889, 5277.778]
4 False 3750.000000 (2638.889, 5277.778]
... ... ... ...
20700 False 15800.000000 (13194.444, 15833.333]
20701 False 4304.333333 (2638.889, 5277.778]
20702 False 2416.666667 (-0.001, 2638.889]
20703 False 4166.666667 (2638.889, 5277.778]
20704 False 6000.000000 (5277.778, 7916.667]
BR for StatedMonthlyIncome
8000
7000
6000
5000
Volume
4000
3000
2000
1000
0
(-0.001, 2638.889] (2638.889, 5277.778] (5277.778, 7916.667]
BR for IncomeRange
6000
5000
4000
Volume
3000
2000
1000
0
$1-24,999 $100,000+ $25,000-49,999
16k
14k
12k
10k
Volume
8k
6k
4k
2k
0
(-0.001, 0.667] (0.667, 1.
BR for DelinquenciesLast7Years
16k
14k
12k
10k
Volume
8k
6k
4k
2k
0
(-0.001, 5.333] (5.333, 10.667] (10.667, 16.0]
In simple words, Population Stability Index (PSI) compares the distribution of a scoring variable (predicted probability) in scoring data set to a
training data set that was used to develop the model. The idea is to check "How the current scoring is compared to the predicted probability
from training data set".
def PSI_numeric(series, in_out_time_series, n_bins=10):
    """Returns the population stability index for numerical variables

    PSI = sum((p_in - p_out) * ln(p_in / p_out)) over quantile bins built
    from the in-time distribution. 0 means identical distributions; the
    larger the value, the bigger the population shift.

    Args:
        series: Pandas Series, the variable to describe
        in_out_time_series: Pandas Series It contains the in time / out of time series
        n_bins: Number of quantile bins built on the in-time data (default 10)

    Returns:
        Estimated PSI (float)
    """
    pd_aux = pd.DataFrame(dict(data = series, in_out = in_out_time_series)).reset_index()
    #capture in time and out of time series
    in_series = pd_aux.loc[pd_aux.in_out == True]['data']
    out_series = pd_aux.loc[pd_aux.in_out == False]['data']
    #quantile bin edges from the in-time data; extremes opened to +-inf so
    #every out-of-time value falls inside some bin
    edges = np.unique(in_series.quantile(np.linspace(0., 1., n_bins + 1)).values)
    if len(edges) < 2:
        #degenerate (constant) in-time series -> a single catch-all bin
        edges = np.array([-np.inf, np.inf])
    else:
        edges[0], edges[-1] = -np.inf, np.inf
    #bin-wise population shares; a small epsilon avoids log(0) / division by zero
    eps = 1e-8
    in_grp = pd.cut(in_series, bins=edges).value_counts(sort=False) / max(len(in_series), 1) + eps
    out_grp = pd.cut(out_series, bins=edges).value_counts(sort=False) / max(len(out_series), 1) + eps
    return sum((in_grp - out_grp) * np.log(in_grp / out_grp))
Args:
series: Pandas Series, the variable to describe
in_out_time_series: Pandas Series It contains the in time / out of time series
Returns:
Estimated PSI
"""
pd_aux = pd.DataFrame(dict(data = series, in_out = in_out_time_series)).reset_index()
#capture in time and out of time series
in_series = pd_aux.loc[pd_aux.in_out == True]['data']
out_series = pd_aux.loc[pd_aux.in_out == False]['data']
40 Recommendations 0.000000
18 PublicRecordsLast12Months 0.000000
27 IncomeVerifiable 0.000071
6 IsBorrowerHomeowner 0.000213
39 PercentFunded 0.000411
17 PublicRecordsLast10Years 0.002025
12 OpenRevolvingAccounts 0.002125
19 RevolvingCreditBalance 0.002709
10 OpenCreditLines 0.003613
11 TotalCreditLinespast7years 0.004108
14 InquiriesLast6Months 0.006294
22 TotalTrades 0.006907
9 CurrentCreditLines 0.007141
13 OpenRevolvingMonthlyPayment 0.007249
21 AvailableBankcardCredit 0.008571
20 BankcardUtilization 0.011055
28 StatedMonthlyIncome 0.011694
26 IncomeRange 0.015400
33 ProsperPaymentsOneMonthPlusLate 0.017845
32 ProsperPaymentsLessThanOneMonthLate 0.020497
25 DebtToIncomeRatio 0.021003
15 TotalInquiries 0.022078
16 DelinquenciesLast7Years 0.024698
24 TradesOpenedLast6Months 0.032809
2 BorrowerState 0.033648
29 TotalProsperLoans 0.037846
3 Occupation 0.047130
5 EmploymentStatusDuration 0.050429
34 ProsperPrincipalBorrowed 0.061380
8 CreditScoreRangeUpper 0.080324
7 CreditScoreRangeLower 0.080324
35 ProsperPrincipalOutstanding 0.112830
31 OnTimeProsperPayments 0.118072
30 TotalProsperPaymentsBilled 0.121264
36 ScorexChangeAtTimeOfListing 0.216728
37 LoanOriginalAmount 0.373789
38 MonthlyLoanPayment 0.780153
0 Term 0.977066
4 EmploymentStatus 2.404796
Args:
df: Pandas DataFrame with the in time input data
df_oot: Pandas DataFrame with the out of time input data
col: Name of the column with the feature under study
Returns:
Dictionary that contains the main statistics of the feature
"""
#dictionary to keep main statistics
dict_stats = {'Mean': df[col].mean(),
'Median': df[col].median(),
'Min': df[col].min(),
'Max': df[col].max(),
'p25': df[col].quantile(0.25),
'p75': df[col].quantile(0.75),
'Std': df[col].std(),
'%NA': 100. * df[col].isna().sum() / df.shape[0],
'%Nonzero': 100. * (df[col] != 0).sum() / df.shape[0],
'%Unique': 100. * df[col].nunique() / df.shape[0]}
#plot data distribution
plt_mn, plt_mx = df[col].quantile(q=[0.025, 0.975])
df[col].hist(range=(plt_mn, plt_mx))
plt.show()
#plot stability distribution
plt.hist(df[col], label='IT', range=(plt_mn, plt_mx), alpha=0.55)
plt.hist(df_oot[col], label='OOT', range=(plt_mn, plt_mx), alpha=0.55)
plt.legend(loc='upper right')
plt.show()
return dict_stats
Args:
df: Pandas DataFrame with the in time input data
df_oot: Pandas DataFrame with the out of time input data
col: Name of the column with the feature under study
Returns:
Dictionary that contains the main statistics of the feature
"""
#dictionary to keep main statistics
dict_stats = {'Unique': df[col].nunique(),
'%Unique': 100. * df[col].nunique() / df.shape[0],
'Top': df[col].value_counts()[:1].index[0],
'Freq @ top': df[col].value_counts()[:1].values[0],
'%NA': 100. * df[col].isna().sum() / df.shape[0]
}
#plot data distribution (only top 15)
df[col].value_counts()[:15].sort_values().plot(kind='barh', colormap='Blues_r')
plt.show()
#plot stability distribution
it_vc = df[col].value_counts()[:15]
oot_vc = df_oot[col].value_counts()[:15]
df_vc = pd.DataFrame({'it_vc': 100. * it_vc / it_vc.sum(),
'oot_vc': 100. * oot_vc / oot_vc.sum()})
df_vc = df_vc.fillna(0.)
df_vc = df_vc.sort_values(by='it_vc', ascending=False)
plt.bar(np.arange(df_vc.shape[0]), df_vc.it_vc, color = 'b', width = 0.25, label='IT')
plt.bar(np.arange(df_vc.shape[0]) + 0.25, df_vc.oot_vc, color = 'g', width = 0.25, label='OOT')
plt.xticks(np.arange(df_vc.shape[0]), df_vc.index.values, rotation='vertical')
plt.legend()
plt.show()
return dict_stats