Chapter 5 - Classification Problems
Chapter 5 - Classification Problems
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 14 columns):
checkin_acc 1000 non-null object
duration 1000 non-null int64
credit_history 1000 non-null object
amount 1000 non-null int64
savings_acc 1000 non-null object
present_emp_since 1000 non-null object
inst_rate 1000 non-null int64
personal_status 1000 non-null object
residing_since 1000 non-null int64
age 1000 non-null int64
inst_plans 1000 non-null object
num_credits 1000 non-null int64
job 1000 non-null object
status 1000 non-null int64
dtypes: int64(7), object(7)
memory usage: 109.5+ KB
credit_df.iloc[0:5,1:7]
credit_df.status.value_counts()
0 700
1 300
Name: status, dtype: int64
['checkin_acc',
'duration',
'credit_history',
'amount',
'savings_acc',
'present_emp_since',
'inst_rate',
'personal_status',
'residing_since',
'age',
'inst_plans',
'num_credits',
'job']
['duration',
'amount',
'inst_rate',
'residing_since',
'age',
'num_credits',
'checkin_acc_A12',
'checkin_acc_A13',
'checkin_acc_A14',
'credit_history_A31',
'credit_history_A32',
'credit_history_A33',
'credit_history_A34',
'savings_acc_A62',
'savings_acc_A63',
'savings_acc_A64',
'savings_acc_A65',
'present_emp_since_A72',
'present_emp_since_A73',
'present_emp_since_A74',
'present_emp_since_A75',
'personal_status_A92',
'personal_status_A93',
'personal_status_A94',
'inst_plans_A142',
'inst_plans_A143',
'job_A172',
'job_A173',
'job_A174']
encoded_credit_df[['checkin_acc_A12',
'checkin_acc_A13',
'checkin_acc_A14']].head(5)
0 0 0 0
1 1 0 0
2 0 0 1
3 0 0 0
4 0 0 0
import statsmodels.api as sm
Y = credit_df.status
X = sm.add_constant( encoded_credit_df )
import statsmodels.api as sm
def get_significant_vars(lm, alpha=0.05):
    """Return the labels of the model terms whose p-value is at most ``alpha``.

    Args:
        lm: Fitted model result exposing a ``pvalues`` attribute. The
            labels returned are the index labels of those p-values.
        alpha: Significance cut-off (inclusive). Defaults to 0.05, the
            value that used to be hard-coded here.

    Returns:
        A list of index labels, in the order they appear in
        ``lm.pvalues``. Entries whose p-value is NaN are never selected.
    """
    # Wrapping in a Series keeps array-like p-values working (labels are
    # then positional integers) and is a no-op for an existing Series.
    p_values = pd.Series(lm.pvalues)
    return list(p_values.index[p_values <= alpha])
significant_vars
['duration',
'amount',
'inst_rate',
'age',
'checkin_acc_A13',
'checkin_acc_A14',
'credit_history_A34',
'savings_acc_A65']
actual predicted_prob
557 1 0.080493
798 0 0.076653
977 0 0.345979
136 0 0.249919
575 0 0.062264
544 0 0.040768
332 1 0.833093
917 1 0.370667
678 0 0.388392
363 0 0.088952
# Turn each predicted probability into a hard 0/1 label: 1 when the
# probability is strictly greater than the 0.5 cut-off, otherwise 0.
y_pred_df['predicted'] = y_pred_df.predicted_prob.map(
    lambda prob: int(prob > 0.5))
557 1 0.080493 0
798 0 0.076653 0
977 0 0.345979 0
136 0 0.249919 0
575 0 0.062264 0
544 0 0.040768 0
332 1 0.833093 1
917 1 0.370667 0
678 0 0.388392 0
363 0 0.088952 0
draw_cm( y_pred_df.actual,
y_pred_df.predicted )
0.78
draw_cm( y_pred_df.actual,
y_pred_df.predicted_new)
print(metrics.classification_report( y_pred_df.actual,
y_pred_df.predicted_new ))
idx = 0
prob cost
4 0.14 150.0
12 0.22 153.0
2 0.12 154.0
10 0.20 154.0
9 0.19 156.0
# Re-label every observation with 0.14 as the probability cut-off (the
# lowest-cost row in the prob/cost table printed just before this cell):
# 1 when the probability is strictly greater than 0.14, otherwise 0.
y_pred_df['predicted_using_cost'] = y_pred_df.predicted_prob.map(
    lambda prob: int(prob > 0.14))
# draw_cm is defined elsewhere; presumably it plots the confusion matrix
# of actual vs. predicted labels - confirm against its definition.
draw_cm(y_pred_df.actual, y_pred_df.predicted_using_cost)
bank_df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4521 entries, 0 to 4520
Data columns (total 11 columns):
age 4521 non-null int64
job 4521 non-null object
marital 4521 non-null object
education 4521 non-null object
default 4521 non-null object
balance 4521 non-null int64
housing-loan 4521 non-null object
personal-loan 4521 non-null object
current-campaign 4521 non-null int64
previous-campaign 4521 non-null int64
subscribed 4521 non-null object
dtypes: int64(4), object(7)
memory usage: 388.6+ KB
['age',
'job',
'marital',
'education',
'default',
'balance',
'housing-loan',
'personal-loan',
'current-campaign',
'previous-campaign']
significant_vars
['const',
'current-campaign',
'previous-campaign',
'job_retired',
'marital_married',
'education_tertiary',
'housing-loan_yes',
'personal-loan_yes']
X_features = ['current-campaign',
'previous-campaign',
'job_retired',
'marital_married',
'education_tertiary',
'housing-loan_yes',
'personal-loan_yes']
logit_model_2.summary2()
# Keep only the probability and the true label, ordered from the
# highest predicted probability down to the lowest.
sorted_predict_df = y_pred_df[['predicted_prob', 'actual']].sort_values(
    by='predicted_prob', ascending=False)
def get_deciles(df):
    """Label the rows of ``df`` with a ``decile`` column numbered 1 to 10.

    Rows are grouped purely by position: the first tenth of the rows get
    decile 1, the next tenth decile 2, and so on. When the row count is
    not a multiple of 10, the earliest deciles each take one extra row.
    Because only row order is used, the caller is expected to have
    sorted ``df`` beforehand (assumption: by predicted probability,
    descending, as ``sorted_predict_df`` is built in this file).

    Args:
        df: DataFrame to label. It is modified in place.

    Returns:
        The same ``df`` object, now carrying an integer ``decile`` column.
    """
    # The previous body set every row to 1 and then added 1 to the whole
    # column, so all rows ended up in "decile" 2 and `idx` was never used.
    base_size, remainder = divmod(len(df), 10)
    labels = []
    for decile in range(1, 11):
        group_size = base_size + 1 if decile <= remainder else base_size
        labels.extend([decile] * group_size)
    # Positional assignment: independent of index labels, so a shuffled
    # or duplicated index cannot misplace a row.
    df['decile'] = labels
    return df
deciles_predict_df[0:10]
3682 0.864769 0 1
97 0.828031 0 1
3426 0.706809 0 1
1312 0.642337 1 1
3930 0.631032 1 1
4397 0.619146 0 1
2070 0.609129 0 1
3023 0.573199 0 1
4080 0.572364 0 1
804 0.559350 0 1
# Sum of the 'actual' column within each decile, one row per decile.
gain_lift_df = pd.DataFrame(
    deciles_predict_df.groupby('decile')['actual'].sum()).reset_index()
gain_lift_df.columns = ['decile', 'gain']
# Cumulative gain as a percentage of the overall total. The lift
# calculation further down reads this column, so it must exist here.
gain_lift_df['gain_percentage'] = (
    100 * gain_lift_df.gain.cumsum() / gain_lift_df.gain.sum())
gain_lift_df
0 1 125 23.992322
1 2 83 39.923225
2 3 73 53.934741
3 4 53 64.107486
4 5 31 70.057582
5 6 46 78.886756
6 7 37 85.988484
7 8 28 91.362764
8 9 25 96.161228
9 10 20 100.000000
plt.show()
Calculating Lift
# Lift = cumulative gain percentage divided by decile * 10.
gain_lift_df['lift'] = gain_lift_df['gain_percentage'].div(
    gain_lift_df['decile'] * 10)
1 2 83 39.923225 1.996161
2 3 73 53.934741 1.797825
3 4 53 64.107486 1.602687
4 5 31 70.057582 1.401152
5 6 46 78.886756 1.314779
6 7 37 85.988484 1.228407
7 8 28 91.362764 1.142035
8 9 25 96.161228 1.068458
9 10 20 100.000000 1.000000
Y = credit_df.status
X = encoded_credit_df
0.5835743204164258
0.4189
X_test.shape
(300, 29)
import math
0.88
0.5763972869236027
# Grid search over `tuned_parameters` (defined elsewhere) for a decision
# tree with default settings, using 10-fold cross-validation and ROC AUC
# as the selection score.
clf_tree = DecisionTreeClassifier()
clf = GridSearchCV(estimator=clf_tree,
                   param_grid=tuned_parameters,
                   cv=10,
                   scoring='roc_auc')
clf.fit(X_train, y_train)
/Users/manaranjan/anaconda/lib/python3.5/site-packages/sklearn/model_selection/_search.py:841: DeprecationWarning: The default of the `iid` parameter will change from True to False in version 0.22 and will be removed in 0.24. This will change numeric results when test-set sizes are unequal.
DeprecationWarning)
GridSearchCV(cv=10, error_score='raise-deprecating',
estimator=DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
max_features=None, max_leaf_nodes=None,
min_impurity_decrease=0.0, min_impurity_split=None,
min_samples_leaf=1, min_samples_split=2,
min_weight_fraction_leaf=0.0, presort=False, random_state=None,
splitter='best'),
fit_params=None, iid='warn', n_jobs=None,
param_grid=[{'max_depth': range(2, 10), 'criterion': ['gini', 'entropy']}],
pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
scoring='roc_auc', verbose=0)
clf.best_score_
0.6824299319727891
clf.best_params_