Classification Problems
# Peek at rows 0-4, columns 1-6 of the credit dataframe.
credit_df.iloc[0:5,1:7]
# Class balance of the target (output below shows 700 zeros vs 300 ones;
# presumably 0 = good credit, 1 = bad — confirm against the dataset docs).
credit_df.status.value_counts()
0 700
1 300
Name: status, dtype: int64
# First five rows of the one-hot encoded checking-account dummies
# (A12/A13/A14; presumably A11 is the dropped baseline level — verify
# against the encoding step, which is not shown in this excerpt).
encoded_credit_df[['checkin_acc_A12',
'checkin_acc_A13',
'checkin_acc_A14']].head(5)
0 0 0 0
1 1 0 0
2 0 0 1
3 0 0 0
4 0 0 0
import statsmodels.api as sm
# Target vector and design matrix: add_constant prepends an intercept
# column to the encoded feature dataframe.
Y = credit_df.status
X = sm.add_constant( encoded_credit_df )
# NOTE(review): duplicate import — statsmodels.api is already imported above.
import statsmodels.api as sm
def get_significant_vars( lm ):
    """Return the names of predictors significant at the 5% level.

    lm: a fitted statsmodels results object exposing a ``pvalues``
    Series indexed by variable name.

    Returns the list of index labels whose p-value is <= 0.05, in the
    order they appear in ``lm.pvalues``.
    """
    pvals = lm.pvalues
    # Boolean-filter the Series directly; the surviving index labels
    # are exactly the significant variable names.
    significant = pvals[pvals <= 0.05]
    return list(significant.index)
significant_vars
['duration',
'amount',
'inst_rate',
'age',
'checkin_acc_A13',
'checkin_acc_A14',
'credit_history_A34',
'savings_acc_A65']
actual predicted_prob
557 1 0.080493
798 0 0.076653
977 0 0.345979
136 0 0.249919
575 0 0.062264
544 0 0.040768
332 1 0.833093
917 1 0.370667
678 0 0.388392
363 0 0.088952
# Convert predicted probabilities into hard class labels using the
# default 0.5 cutoff: probability above 0.5 -> positive class (1).
y_pred_df['predicted'] = [1 if prob > 0.5 else 0
                          for prob in y_pred_df.predicted_prob]
557 1 0.080493 0
798 0 0.076653 0
977 0 0.345979 0
136 0 0.249919 0
575 0 0.062264 0
544 0 0.040768 0
332 1 0.833093 1
917 1 0.370667 0
678 0 0.388392 0
363 0 0.088952 0
# Confusion matrix at the default 0.5 cutoff (draw_cm is a project
# helper defined elsewhere — not visible in this excerpt).
draw_cm( y_pred_df.actual,
y_pred_df.predicted )
# Confusion matrix for re-labelled predictions; predicted_new is
# computed elsewhere (presumably with a different cutoff — confirm).
draw_cm( y_pred_df.actual,
y_pred_df.predicted_new)
# Per-class precision/recall/F1 for the new labels.
print(metrics.classification_report( y_pred_df.actual,
y_pred_df.predicted_new ))
# NOTE(review): stray line — appears displaced by extraction, likely
# from the cost-search or decile loop shown later in the chapter.
idx = 0
prob cost
4 0.14 150.0
12 0.22 153.0
2 0.12 154.0
10 0.20 154.0
9 0.19 156.0
# Re-label predictions with the cost-optimal cutoff (0.14, the
# minimum-cost probability from the table above) instead of 0.5.
y_pred_df['predicted_using_cost'] = y_pred_df.predicted_prob.map(
    lambda prob: int(prob > 0.14))
# Confusion matrix for the cost-based classification.
draw_cm(y_pred_df.actual,
        y_pred_df.predicted_using_cost)
bank_df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4521 entries, 0 to 4520
Data columns (total 11 columns):
age 4521 non-null int64
job 4521 non-null object
marital 4521 non-null object
education 4521 non-null object
default 4521 non-null object
balance 4521 non-null int64
housing-loan 4521 non-null object
personal-loan 4521 non-null object
current-campaign 4521 non-null int64
previous-campaign 4521 non-null int64
subscribed 4521 non-null object
dtypes: int64(4), object(7)
memory usage: 388.6+ KB
significant_vars
['const',
'current-campaign',
'previous-campaign',
'job_retired',
'marital_married',
'education_tertiary',
'housing-loan_yes',
'personal-loan_yes']
# Reduced feature set: only the predictors found significant above
# (intercept handled separately, hence 'const' is omitted here).
X_features = ['current-campaign',
'previous-campaign',
'job_retired',
'marital_married',
'education_tertiary',
'housing-loan_yes',
'personal-loan_yes']
# Summary table of the refit logistic model (fit not shown in excerpt).
logit_model_2.summary2()
# Sort observations by predicted probability, highest first, in
# preparation for the decile / gain-lift analysis below.
sorted_predict_df = y_pred_df[['predicted_prob',
'actual']].sort_values( 'predicted_prob',
ascending = False )
def get_deciles( df ):
    """Assign a decile number (1..10) to each row of *df*, in place.

    Assumes *df* is already sorted by predicted probability in
    descending order, so decile 1 holds the highest-probability rows.
    Each decile gets ``len(df) // 10`` consecutive rows; any remainder
    rows at the end keep the initial placeholder value.

    Returns the same dataframe with a new ``decile`` column.

    NOTE(review): the extracted text had lost the loop that spreads
    rows across buckets (as printed, every row ended up in one decile);
    reconstructed here to match the decile/gain output that follows.
    """
    df['decile'] = 1
    idx = 0
    bucket_size = int(len(df) / 10)
    # Fill buckets 0..9 with consecutive slices of equal size, writing
    # by position into the 'decile' column.
    for bucket in range(10):
        df.iloc[idx:idx + bucket_size,
                df.columns.get_loc('decile')] = bucket
        idx += bucket_size
    # Shift from 0-based buckets to 1-based decile labels.
    df['decile'] = df['decile'] + 1
    return df
deciles_predict_df[0:10]
3682 0.864769 0 1
97 0.828031 0 1
3426 0.706809 0 1
1312 0.642337 1 1
3930 0.631032 1 1
4397 0.619146 0 1
2070 0.609129 0 1
3023 0.573199 0 1
4080 0.572364 0 1
804 0.559350 0 1
# Number of actual positives captured ("gain") within each decile.
gain_lift_df = pd.DataFrame(
deciles_predict_df.groupby(
'decile')['actual'].sum() ).reset_index()
gain_lift_df.columns = ['decile', 'gain']
# Cumulative share (%) of all positives captured by the top-k deciles.
gain_lift_df['gain_percentage'] = (100 *
gain_lift_df.gain.cumsum()/gain_lift_df.gain.sum())
gain_lift_df
0 1 125 23.992322
1 2 83 39.923225
2 3 73 53.934741
3 4 53 64.107486
4 5 31 70.057582
5 6 46 78.886756
6 7 37 85.988484
7 8 28 91.362764
8 9 25 96.161228
9 10 20 100.000000
plt.show()
Calculating Lift
# Lift = cumulative gain% divided by the % of the population contacted
# so far (each decile covers 10% of the cases).
gain_lift_df['lift'] = ( gain_lift_df.gain_percentage
/ ( gain_lift_df.decile * 10) )
gain_lift_df
1 2 83 39.923225 1.996161
2 3 73 53.934741 1.797825
3 4 53 64.107486 1.602687
4 5 31 70.057582 1.401152
5 6 46 78.886756 1.314779
6 7 37 85.988484 1.228407
7 8 28 91.362764 1.142035
8 9 25 96.161228 1.068458
9 10 20 100.000000 1.000000
# Rebuild target/features for the decision-tree example; no intercept
# column is added this time (tree models do not need one).
Y = credit_df.status
X = encoded_credit_df
X_test.shape
(300, 29)
import math
# Grid-search a decision tree over tuned_parameters (defined elsewhere
# in the chapter), with 10-fold cross-validation, selecting the model
# that maximizes ROC AUC.
clf_tree = DecisionTreeClassifier()
clf = GridSearchCV(clf_tree,
tuned_parameters,
cv=10,
scoring='roc_auc')
clf.fit(X_train, y_train )
/Users/manaranjan/anaconda/lib/python3.5/site-packages/sklearn/model
_selection/_search.py:841: DeprecationWarning: The default of the `i
id` parameter will change from True to False in version 0.22 and wil
l be removed in 0.24. This will change numeric results when test-set
sizes are unequal.
DeprecationWarning)
GridSearchCV(cv=10, error_score='raise-deprecating',
estimator=DecisionTreeClassifier(class_weight=None, criterion
='gini', max_depth=None,
max_features=None, max_leaf_nodes=None,
min_impurity_decrease=0.0, min_impurity_split=None,
min_samples_leaf=1, min_samples_split=2,
min_weight_fraction_leaf=0.0, presort=False, random_stat
e=None,
splitter='best'),
fit_params=None, iid='warn', n_jobs=None,
param_grid=[{'max_depth': range(2, 10), 'criterion': ['gini',
'entropy']}],
pre_dispatch='2*n_jobs', refit=True, return_train_score='war
n',
scoring='roc_auc', verbose=0)
clf.best_score_
0.6824299319727891
clf.best_params_
{'criterion': 'gini', 'max_depth': 2}