# Home Credit Data
# Steps :
# Exploratory Data Analysis - Data Preparation and Cleansing (60% of time)
# 1. Measures of Central Tendency
# 2. Measures of Dispersion
# 3. Third Moment Business decision (skewness)
# 4. Fourth Moment Business decision (kurtosis)
# 5. Probability distributions of variables
# 6. Graphical representations
# > Histogram, box plot, dot plot, stem-and-leaf plot, bar plot
# We'll focus on the application_train.csv and application_test.csv files, which
# contain a significant amount of useful information for predicting credit default.
# This is the main source of information on people who have applied for personal
# loans, including features related to their loan application.
# Read database
# read_csv() comes from readr; attach it here so the script also runs from a
# fresh R session (the original called read_csv() without loading the package).
library(readr)

# Read the application training file into `train`.
# NOTE(review): the path is machine-specific (D: drive) -- adjust for your setup.
train <- read_csv("D://home-credit-default-risk//application_train.csv")
# Parsed with column specification:
# cols(
# .default = col_double(),
# NAME_CONTRACT_TYPE = col_character(),
# CODE_GENDER = col_character(),
# FLAG_OWN_CAR = col_character(),
# FLAG_OWN_REALTY = col_character(),
# NAME_TYPE_SUITE = col_character(),
# NAME_INCOME_TYPE = col_character(),
# NAME_EDUCATION_TYPE = col_character(),
# NAME_FAMILY_STATUS = col_character(),
# NAME_HOUSING_TYPE = col_character(),
# OCCUPATION_TYPE = col_character(),
# WEEKDAY_APPR_PROCESS_START = col_character(),
# ORGANIZATION_TYPE = col_character(),
# FONDKAPREMONT_MODE = col_character(),
# HOUSETYPE_MODE = col_character(),
# WALLSMATERIAL_MODE = col_character(),
# EMERGENCYSTATE_MODE = col_character()
# )
#See spec(...) for full column specifications.
# Dimensions of the training table: rows, then columns.
dim(train)
# [1] 307511 122

# `test` was referenced here without being created anywhere earlier in the
# script, so this step failed with "object 'test' not found" in a fresh
# session. Load application_test.csv first so the NA count below can run.
# NOTE(review): path inferred from the directory used for the training file --
# confirm it matches where application_test.csv actually lives.
test <- read_csv("D://home-credit-default-risk//application_test.csv")

# Total number of missing (NA) cells across the whole test table.
sum(is.na(test))
# [1] 1404419

# Compact per-column structure of the training table (type + first values).
str(train)
# Classes 'spec_tbl_df', 'tbl_df', 'tbl' and 'data.frame': 307511 obs. of 122
# variables:
# $ SK_ID_CURR : num 1e+05 1e+05 1e+05 1e+05 1e+05 ...
# $ TARGET : num 1 0 0 0 0 0 0 0 0 0 ...
# $ NAME_CONTRACT_TYPE : chr "Cash loans" "Cash loans" "Revolving loans"
#   "Cash loans" ...
# $ CODE_GENDER : chr "M" "F" "M" "F" ...
# $ FLAG_OWN_CAR : chr "N" "N" "Y" "N" ...
# $ FLAG_OWN_REALTY : chr "Y" "N" "Y" "Y" ...
# $ CNT_CHILDREN : num 0 0 0 0 0 0 1 0 0 0 ...
# $ AMT_INCOME_TOTAL : num 202500 270000 67500 135000 121500 ...
# $ AMT_CREDIT : num 406598 1293503 135000 312683 513000 ...
# $ AMT_ANNUITY : num 24701 35699 6750 29687 21866 ...
# $ AMT_GOODS_PRICE : num 351000 1129500 135000 297000 513000 ...
# $ NAME_TYPE_SUITE : chr "Unaccompanied" "Family" "Unaccompanied"
#   "Unaccompanied" ...
# $ NAME_INCOME_TYPE : chr "Working" "State servant" "Working"
#   "Working" ...
# $ NAME_EDUCATION_TYPE : chr "Secondary / secondary special"
#   "Higher education" "Secondary / secondary special"
#   "Secondary / secondary special" ...
# $ NAME_FAMILY_STATUS : chr "Single / not married" "Married"
#   "Single / not married" "Civil marriage" ...
# $ NAME_HOUSING_TYPE : chr "House / apartment" "House / apartment"
#   "House / apartment" "House / apartment" ...
# $ REGION_POPULATION_RELATIVE : num 0.0188 0.00354 0.01003 0.00802 0.02866 ...
# $ DAYS_BIRTH : num -9461 -16765 -19046 -19005 -19932 ...
# $ DAYS_EMPLOYED : num -637 -1188 -225 -3039 -3038 ...
# $ DAYS_REGISTRATION : num -3648 -1186 -4260 -9833 -4311 ...
# $ DAYS_ID_PUBLISH : num -2120 -291 -2531 -2437 -3458 ...
# $ OWN_CAR_AGE : num NA NA 26 NA NA NA 17 8 NA NA ...
# $ FLAG_MOBIL : num 1 1 1 1 1 1 1 1 1 1 ...
# $ FLAG_EMP_PHONE : num 1 1 1 1 1 1 1 1 0 1 ...
# $ FLAG_WORK_PHONE : num 0 0 1 0 0 1 0 1 0 0 ...
# $ FLAG_CONT_MOBILE : num 1 1 1 1 1 1 1 1 1 1 ...
# $ FLAG_PHONE : num 1 1 1 0 0 1 1 0 0 0 ...
# $ FLAG_EMAIL : num 0 0 0 0 0 0 0 0 0 0 ...
# $ OCCUPATION_TYPE : chr "Laborers" "Core staff" "Laborers"
#   "Laborers" ...
# $ CNT_FAM_MEMBERS : num 1 2 1 2 1 2 3 2 2 1 ...
# $ REGION_RATING_CLIENT : num 2 1 2 2 2 2 2 3 2 2 ...
# $ REGION_RATING_CLIENT_W_CITY : num 2 1 2 2 2 2 2 3 2 2 ...
# $ WEEKDAY_APPR_PROCESS_START : chr "WEDNESDAY" "MONDAY" "MONDAY"
#   "WEDNESDAY" ...
# $ HOUR_APPR_PROCESS_START : num 10 11 9 17 11 16 16 16 14 8 ...
# $ REG_REGION_NOT_LIVE_REGION : num 0 0 0 0 0 0 0 0 0 0 ...
# $ REG_REGION_NOT_WORK_REGION : num 0 0 0 0 0 0 0 0 0 0 ...
# $ LIVE_REGION_NOT_WORK_REGION : num 0 0 0 0 0 0 0 0 0 0 ...
# $ REG_CITY_NOT_LIVE_CITY : num 0 0 0 0 0 0 0 0 0 0 ...
# $ REG_CITY_NOT_WORK_CITY : num 0 0 0 0 1 0 0 1 0 0 ...
# $ LIVE_CITY_NOT_WORK_CITY : num 0 0 0 0 1 0 0 1 0 0 ...
# $ ORGANIZATION_TYPE : chr "Business Entity Type 3" "School"
#   "Government" "Business Entity Type 3" ...
# $ EXT_SOURCE_1 : num 0.083 0.311 NA NA NA ...
# $ EXT_SOURCE_2 : num 0.263 0.622 0.556 0.65 0.323 ...
# $ EXT_SOURCE_3 : num 0.139 NA 0.73 NA NA ...
# $ APARTMENTS_AVG : num 0.0247 0.0959 NA NA NA NA NA NA NA NA ...
# $ BASEMENTAREA_AVG : num 0.0369 0.0529 NA NA NA NA NA NA NA NA ...
# $ YEARS_BEGINEXPLUATATION_AVG : num 0.972 0.985 NA NA NA ...
# $ YEARS_BUILD_AVG : num 0.619 0.796 NA NA NA ...
# $ COMMONAREA_AVG : num 0.0143 0.0605 NA NA NA NA NA NA NA NA ...
# $ ELEVATORS_AVG : num 0 0.08 NA NA NA NA NA NA NA NA ...
# $ ENTRANCES_AVG : num 0.069 0.0345 NA NA NA NA NA NA NA NA ...
# $ FLOORSMAX_AVG : num 0.0833 0.2917 NA NA NA ...
# $ FLOORSMIN_AVG : num 0.125 0.333 NA NA NA ...
# $ LANDAREA_AVG : num 0.0369 0.013 NA NA NA NA NA NA NA NA ...
# $ LIVINGAPARTMENTS_AVG : num 0.0202 0.0773 NA NA NA NA NA NA NA NA ...
# $ LIVINGAREA_AVG : num 0.019 0.0549 NA NA NA NA NA NA NA NA ...
# $ NONLIVINGAPARTMENTS_AVG : num 0 0.0039 NA NA NA NA NA NA NA NA ...
# $ NONLIVINGAREA_AVG : num 0 0.0098 NA NA NA NA NA NA NA NA ...
# $ APARTMENTS_MODE : num 0.0252 0.0924 NA NA NA NA NA NA NA NA ...
# $ BASEMENTAREA_MODE : num 0.0383 0.0538 NA NA NA NA NA NA NA NA ...
# $ YEARS_BEGINEXPLUATATION_MODE: num 0.972 0.985 NA NA NA ...
# $ YEARS_BUILD_MODE : num 0.634 0.804 NA NA NA ...
# $ COMMONAREA_MODE : num 0.0144 0.0497 NA NA NA NA NA NA NA NA ...
# $ ELEVATORS_MODE : num 0 0.0806 NA NA NA NA NA NA NA NA ...
# $ ENTRANCES_MODE : num 0.069 0.0345 NA NA NA NA NA NA NA NA ...
# $ FLOORSMAX_MODE : num 0.0833 0.2917 NA NA NA ...
# $ FLOORSMIN_MODE : num 0.125 0.333 NA NA NA ...
# $ LANDAREA_MODE : num 0.0377 0.0128 NA NA NA NA NA NA NA NA ...
# $ LIVINGAPARTMENTS_MODE : num 0.022 0.079 NA NA NA NA NA NA NA NA ...
# $ LIVINGAREA_MODE : num 0.0198 0.0554 NA NA NA NA NA NA NA NA ...
# $ NONLIVINGAPARTMENTS_MODE : num 0 0 NA NA NA NA NA NA NA NA ...
# $ NONLIVINGAREA_MODE : num 0 0 NA NA NA NA NA NA NA NA ...
# $ APARTMENTS_MEDI : num 0.025 0.0968 NA NA NA NA NA NA NA NA ...
# $ BASEMENTAREA_MEDI : num 0.0369 0.0529 NA NA NA NA NA NA NA NA ...
# $ YEARS_BEGINEXPLUATATION_MEDI: num 0.972 0.985 NA NA NA ...
# $ YEARS_BUILD_MEDI : num 0.624 0.799 NA NA NA ...
# $ COMMONAREA_MEDI : num 0.0144 0.0608 NA NA NA NA NA NA NA NA ...
# $ ELEVATORS_MEDI : num 0 0.08 NA NA NA NA NA NA NA NA ...
# $ ENTRANCES_MEDI : num 0.069 0.0345 NA NA NA NA NA NA NA NA ...
# $ FLOORSMAX_MEDI : num 0.0833 0.2917 NA NA NA ...
# $ FLOORSMIN_MEDI : num 0.125 0.333 NA NA NA ...
# $ LANDAREA_MEDI : num 0.0375 0.0132 NA NA NA NA NA NA NA NA ...
# $ LIVINGAPARTMENTS_MEDI : num 0.0205 0.0787 NA NA NA NA NA NA NA NA ...
# $ LIVINGAREA_MEDI : num 0.0193 0.0558 NA NA NA NA NA NA NA NA ...
# $ NONLIVINGAPARTMENTS_MEDI : num 0 0.0039 NA NA NA NA NA NA NA NA ...
# $ NONLIVINGAREA_MEDI : num 0 0.01 NA NA NA NA NA NA NA NA ...
# $ FONDKAPREMONT_MODE : chr "reg oper account" "reg oper account" NA NA ...
# $ HOUSETYPE_MODE : chr "block of flats" "block of flats" NA NA ...
# $ TOTALAREA_MODE : num 0.0149 0.0714 NA NA NA NA NA NA NA NA ...
# $ WALLSMATERIAL_MODE : chr "Stone, brick" "Block" NA NA ...
# $ EMERGENCYSTATE_MODE : chr "No" "No" NA NA ...
# $ OBS_30_CNT_SOCIAL_CIRCLE : num 2 1 0 2 0 0 1 2 1 2 ...
# $ DEF_30_CNT_SOCIAL_CIRCLE : num 2 0 0 0 0 0 0 0 0 0 ...
# $ OBS_60_CNT_SOCIAL_CIRCLE : num 2 1 0 2 0 0 1 2 1 2 ...
# $ DEF_60_CNT_SOCIAL_CIRCLE : num 2 0 0 0 0 0 0 0 0 0 ...
# $ DAYS_LAST_PHONE_CHANGE : num -1134 -828 -815 -617 -1106 ...
# $ FLAG_DOCUMENT_2 : num 0 0 0 0 0 0 0 0 0 0 ...
# $ FLAG_DOCUMENT_3 : num 1 1 0 1 0 1 0 1 1 0 ...
# $ FLAG_DOCUMENT_4 : num 0 0 0 0 0 0 0 0 0 0 ...
# [list output truncated]
# - attr(*, "spec")=
# .. cols(
# .. SK_ID_CURR = col_double(),
# .. TARGET = col_double(),
# .. NAME_CONTRACT_TYPE = col_character(),
# .. CODE_GENDER = col_character(),
# .. FLAG_OWN_CAR = col_character(),
# .. FLAG_OWN_REALTY = col_character(),
# .. CNT_CHILDREN = col_double(),
# .. AMT_INCOME_TOTAL = col_double(),
# .. AMT_CREDIT = col_double(),
# .. AMT_ANNUITY = col_double(),
# .. AMT_GOODS_PRICE = col_double(),
# .. NAME_TYPE_SUITE = col_character(),
# .. NAME_INCOME_TYPE = col_character(),
# .. NAME_EDUCATION_TYPE = col_character(),
# .. NAME_FAMILY_STATUS = col_character(),
# .. NAME_HOUSING_TYPE = col_character(),
# .. REGION_POPULATION_RELATIVE = col_double(),
# .. DAYS_BIRTH = col_double(),
# .. DAYS_EMPLOYED = col_double(),
# .. DAYS_REGISTRATION = col_double(),
# .. DAYS_ID_PUBLISH = col_double(),
# .. OWN_CAR_AGE = col_double(),
# .. FLAG_MOBIL = col_double(),
# .. FLAG_EMP_PHONE = col_double(),
# .. FLAG_WORK_PHONE = col_double(),
# .. FLAG_CONT_MOBILE = col_double(),
# .. FLAG_PHONE = col_double(),
# .. FLAG_EMAIL = col_double(),
# .. OCCUPATION_TYPE = col_character(),
# .. CNT_FAM_MEMBERS = col_double(),
# .. REGION_RATING_CLIENT = col_double(),
# .. REGION_RATING_CLIENT_W_CITY = col_double(),
# .. WEEKDAY_APPR_PROCESS_START = col_character(),
# .. HOUR_APPR_PROCESS_START = col_double(),
# .. REG_REGION_NOT_LIVE_REGION = col_double(),
# .. REG_REGION_NOT_WORK_REGION = col_double(),
# .. LIVE_REGION_NOT_WORK_REGION = col_double(),
# .. REG_CITY_NOT_LIVE_CITY = col_double(),
# .. REG_CITY_NOT_WORK_CITY = col_double(),
# .. LIVE_CITY_NOT_WORK_CITY = col_double(),
# .. ORGANIZATION_TYPE = col_character(),
# .. EXT_SOURCE_1 = col_double(),
# .. EXT_SOURCE_2 = col_double(),
# .. EXT_SOURCE_3 = col_double(),
# .. APARTMENTS_AVG = col_double(),
# .. BASEMENTAREA_AVG = col_double(),
# .. YEARS_BEGINEXPLUATATION_AVG = col_double(),
# .. YEARS_BUILD_AVG = col_double(),
# .. COMMONAREA_AVG = col_double(),
# .. ELEVATORS_AVG = col_double(),
# .. ENTRANCES_AVG = col_double(),
# .. FLOORSMAX_AVG = col_double(),
# .. FLOORSMIN_AVG = col_double(),
# .. LANDAREA_AVG = col_double(),
# .. LIVINGAPARTMENTS_AVG = col_double(),
# .. LIVINGAREA_AVG = col_double(),
# .. NONLIVINGAPARTMENTS_AVG = col_double(),
# .. NONLIVINGAREA_AVG = col_double(),
# .. APARTMENTS_MODE = col_double(),
# .. BASEMENTAREA_MODE = col_double(),
# .. YEARS_BEGINEXPLUATATION_MODE = col_double(),
# .. YEARS_BUILD_MODE = col_double(),
# .. COMMONAREA_MODE = col_double(),
# .. ELEVATORS_MODE = col_double(),
# .. ENTRANCES_MODE = col_double(),
# .. FLOORSMAX_MODE = col_double(),
# .. FLOORSMIN_MODE = col_double(),
# .. LANDAREA_MODE = col_double(),
# .. LIVINGAPARTMENTS_MODE = col_double(),
# .. LIVINGAREA_MODE = col_double(),
# .. NONLIVINGAPARTMENTS_MODE = col_double(),
# .. NONLIVINGAREA_MODE = col_double(),
# .. APARTMENTS_MEDI = col_double(),
# .. BASEMENTAREA_MEDI = col_double(),
# .. YEARS_BEGINEXPLUATATION_MEDI = col_double(),
# .. YEARS_BUILD_MEDI = col_double(),
# .. COMMONAREA_MEDI = col_double(),
# .. ELEVATORS_MEDI = col_double(),
# .. ENTRANCES_MEDI = col_double(),
# .. FLOORSMAX_MEDI = col_double(),
# .. FLOORSMIN_MEDI = col_double(),
# .. LANDAREA_MEDI = col_double(),
# .. LIVINGAPARTMENTS_MEDI = col_double(),
# .. LIVINGAREA_MEDI = col_double(),
# .. NONLIVINGAPARTMENTS_MEDI = col_double(),
# .. NONLIVINGAREA_MEDI = col_double(),
# .. FONDKAPREMONT_MODE = col_character(),
# .. HOUSETYPE_MODE = col_character(),
# .. TOTALAREA_MODE = col_double(),
# .. WALLSMATERIAL_MODE = col_character(),
# .. EMERGENCYSTATE_MODE = col_character(),
# .. OBS_30_CNT_SOCIAL_CIRCLE = col_double(),
# .. DEF_30_CNT_SOCIAL_CIRCLE = col_double(),
# .. OBS_60_CNT_SOCIAL_CIRCLE = col_double(),
# .. DEF_60_CNT_SOCIAL_CIRCLE = col_double(),
# .. DAYS_LAST_PHONE_CHANGE = col_double(),
# .. FLAG_DOCUMENT_2 = col_double(),
# .. FLAG_DOCUMENT_3 = col_double(),
# .. FLAG_DOCUMENT_4 = col_double(),
# .. FLAG_DOCUMENT_5 = col_double(),
# .. FLAG_DOCUMENT_6 = col_double(),
# .. FLAG_DOCUMENT_7 = col_double(),
# .. FLAG_DOCUMENT_8 = col_double(),
# .. FLAG_DOCUMENT_9 = col_double(),
# .. FLAG_DOCUMENT_10 = col_double(),
# .. FLAG_DOCUMENT_11 = col_double(),
# .. FLAG_DOCUMENT_12 = col_double(),
# .. FLAG_DOCUMENT_13 = col_double(),
# .. FLAG_DOCUMENT_14 = col_double(),
# .. FLAG_DOCUMENT_15 = col_double(),
# .. FLAG_DOCUMENT_16 = col_double(),
# .. FLAG_DOCUMENT_17 = col_double(),
# .. FLAG_DOCUMENT_18 = col_double(),
# .. FLAG_DOCUMENT_19 = col_double(),
# .. FLAG_DOCUMENT_20 = col_double(),
# .. FLAG_DOCUMENT_21 = col_double(),
# .. AMT_REQ_CREDIT_BUREAU_HOUR = col_double(),
# .. AMT_REQ_CREDIT_BUREAU_DAY = col_double(),
# .. AMT_REQ_CREDIT_BUREAU_WEEK = col_double(),
# .. AMT_REQ_CREDIT_BUREAU_MON = col_double(),
# .. AMT_REQ_CREDIT_BUREAU_QRT = col_double(),
# .. AMT_REQ_CREDIT_BUREAU_YEAR = col_double()
# .. )