X = ipl_auction_encoded_df
Y = ipl_auction_df['SOLD PRICE']
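The model object linreg used below is not created in this extract; a minimal sketch, assuming an 80:20 train/test split and scikit-learn's LinearRegression (the split proportion and random_state are assumptions):

## Sketch (assumed): splitting the data and fitting a linear regression model
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression

X_train, X_test, y_train, y_test = train_test_split( X, Y, train_size = 0.8, random_state = 42 )
linreg = LinearRegression()
linreg.fit( X_train, y_train )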
linreg.coef_
array([..., 0.24880095,  0.09546057,  0.16428731,  0.26400753, -0.08253341,
       -0.28643889, -0.26842214, -0.21910913, -0.02622351,  0.24817898,
        0.18760332,  0.10776084,  0.04737488,  0.05191335,  0.01235245,
        0.00547115, -0.03124706,  0.08530192,  0.01790803, -0.05077454,
        0.18745577])
[Bar plot of the regression coefficients for each feature; y-axis labelled 'Features']
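get_train_test_rmse() is a helper that is not defined in this extract; a plausible sketch that reports RMSE on the train and test splits (the exact implementation and print format are assumptions):

## Sketch (assumed): RMSE of a fitted model on the train and test sets
from sklearn import metrics
import numpy as np

def get_train_test_rmse( model ):
    train_rmse = np.sqrt( metrics.mean_squared_error( y_train, model.predict( X_train ) ) )
    test_rmse = np.sqrt( metrics.mean_squared_error( y_test, model.predict( X_test ) ) )
    print( "train:", round( train_rmse, 3 ), " test:", round( test_rmse, 3 ) )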
get_train_test_rmse( linreg )
train: 0.679 test: 0.749
# Applying alpha = 1 and running the algorithm for a maximum of 500 iterations
from sklearn.linear_model import Ridge
ridge = Ridge(alpha = 1, max_iter = 500)
ridge.fit( X_train, y_train )
get_train_test_rmse( ridge )
train: 0.68 test: 0.724
# Applying alpha = 0.01 and running the algorithm for a maximum of 500 iterations
from sklearn.linear_model import Lasso
lasso = Lasso(alpha = 0.01, max_iter = 500)
lasso.fit( X_train, y_train )
Lasso(alpha=0.01, copy_X=True, fit_intercept=True, max_iter=500,
      normalize=False, positive=False, precompute=False, random_state=None,
      selection='cyclic', tol=0.0001, warm_start=False)
get_train_test_rmse( lasso )
train: 0.688 test: 0.698
      coef   columns
1     -0.0    T-WKTS
3     -0.0  ODI-SR-B
13    -0.0    AVE-BL
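The coef/columns table above can be obtained by pairing lasso.coef_ with the encoded column names and keeping only the features that Lasso has shrunk to zero; a minimal sketch (the DataFrame construction is an assumption):

## Sketch (assumed): features whose Lasso coefficients have been shrunk to zero
import pandas as pd

lasso_coef_df = pd.DataFrame( { 'coef': lasso.coef_,
                                'columns': X.columns } )
lasso_coef_df[ lasso_coef_df.coef == 0 ]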
0.01/1.01
0.009900990099009901
bank_df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4521 entries, 0 to 4520
Data columns (total 11 columns):
age 4521 non-null int64
job 4521 non-null object
marital 4521 non-null object
education 4521 non-null object
default 4521 non-null object
balance 4521 non-null int64
housing-loan 4521 non-null object
personal-loan 4521 non-null object
current-campaign 4521 non-null int64
previous-campaign 4521 non-null int64
subscribed 4521 non-null object
dtypes: int64(4), object(7)
memory usage: 388.6+ KB
bank_df.subscribed.value_counts()
no 4000
yes 521
Name: subscribed, dtype: int64
## Importing resample from the sklearn.utils package.
from sklearn.utils import resample
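The resampling step itself does not appear in this extract; a minimal sketch, assuming the minority class ('yes') is upsampled with replacement to match the 4000 majority-class records, the result stored in new_bank_df, and X_features holding all columns except the label (all variable names other than resample and bank_df are assumptions):

## Sketch (assumed): upsampling the minority class so both classes are equally represented
bank_subscribed_df = bank_df[ bank_df.subscribed == 'yes' ]
bank_not_subscribed_df = bank_df[ bank_df.subscribed == 'no' ]

bank_subscribed_upsampled = resample( bank_subscribed_df,
                                      replace = True,      # sample with replacement
                                      n_samples = 4000,    # match the majority class count
                                      random_state = 42 )  # random_state chosen arbitrarily
new_bank_df = pd.concat( [ bank_not_subscribed_df, bank_subscribed_upsampled ] )

## All columns except the label are used as features
X_features = list( new_bank_df.columns )
X_features.remove( 'subscribed' )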
## get_dummies() will convert all columns of type object into one-hot encoded (dummy) columns
encoded_bank_df = pd.get_dummies( new_bank_df[X_features], drop_first = True )
X = encoded_bank_df
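The logistic regression model logit used below is not constructed in this extract; a minimal sketch, assuming the label is encoded as 1 for 'yes', a 70:30 train/test split, and scikit-learn's LogisticRegression (split proportion and random_state are assumptions):

## Sketch (assumed): encoding the label, splitting the data and fitting a logistic regression
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

Y = new_bank_df.subscribed.map( lambda x: int( x == 'yes' ) )
train_X, test_X, train_y, test_y = train_test_split( X, Y, train_size = 0.7, random_state = 42 )
logit = LogisticRegression()
logit.fit( train_X, train_y )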
pred_y = logit.predict(test_X)
## Defining a function to draw the confusion matrix from actual and predicted class labels
from sklearn import metrics
import seaborn as sn
import matplotlib.pyplot as plt

def draw_cm( actual, predicted ):
    # Invoking confusion_matrix from the metrics package. The matrix will be oriented as [1,0],
    # i.e. the class with label 1 is represented in the first row and label 0 in the second row
    cm = metrics.confusion_matrix( actual, predicted, labels = [1,0] )
    ## The confusion matrix is plotted as a heatmap for better visualization
    ## The tick labels are configured for easier interpretation of the plot
    sn.heatmap( cm, annot=True, fmt='.2f',
                xticklabels = ["Subscribed", "Not Subscribed"],
                yticklabels = ["Subscribed", "Not Subscribed"] )
    plt.ylabel( 'True label' )
    plt.xlabel( 'Predicted label' )
    plt.show()
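The probability table that follows comes from predict_proba on the test set; a minimal sketch of the call that would produce it (wrapping the result in a DataFrame is an assumption):

## Sketch (assumed): predicted class probabilities for the test set
pred_prob = logit.predict_proba( test_X )
pd.DataFrame( pred_prob ).head( 5 )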
          0         1
0  0.704479  0.295521
1  0.853664  0.146336
2  0.666963  0.333037
3  0.588329  0.411671
4  0.707982  0.292018
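test_results_df itself is not built in this extract; a plausible construction consistent with the columns referenced below, where 'actual' holds the true labels and 'chd_1' (the name used in the AUC call) holds the positive-class probability:

## Sketch (assumed): collecting actual labels and positive-class probabilities for evaluation
test_results_df = pd.DataFrame( { 'actual': test_y } )
test_results_df = test_results_df.reset_index()
test_results_df['chd_1'] = pred_prob[:, 1]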
test_results_df.head(5)
   index  actual     chd_1
0   1321       0  0.295521
1   3677       0  0.146336
2   1680       1  0.333037
3    821       0  0.411671
4    921       0  0.292018
# Passing actual class labels and the predicted probability values to compute ROC AUC score.
auc_score = metrics.roc_auc_score( test_results_df.actual, test_results_df.chd_1 )
round( float( auc_score ), 2 )
0.69
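The ROC curve behind this AUC can be drawn from the same two columns; a minimal sketch using metrics.roc_curve (the plot styling is an assumption):

## Sketch (assumed): plotting the ROC curve from actual labels and predicted probabilities
fpr, tpr, thresholds = metrics.roc_curve( test_results_df.actual, test_results_df.chd_1 )
plt.plot( fpr, tpr, label = 'ROC curve (AUC = %0.2f)' % auc_score )
plt.plot( [0, 1], [0, 1], 'k--' )
plt.xlabel( 'False Positive Rate' )
plt.ylabel( 'True Positive Rate' )
plt.legend( loc = 'lower right' )
plt.show()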
## Importing GridSearchCV
from sklearn.model_selection import GridSearchCV
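The call that produces the fitted estimator shown below does not appear in this extract; a minimal sketch, assuming a KNeighborsClassifier wrapped in a 10-fold grid search over the parameter grid visible in the output:

## Sketch (assumed): grid search over KNN neighbour counts and distance metrics, scored by ROC AUC
from sklearn.neighbors import KNeighborsClassifier

knn_params = [ { 'n_neighbors': range(5, 10),
                 'metric': ['canberra', 'euclidean', 'minkowski'] } ]
clf = GridSearchCV( KNeighborsClassifier(), knn_params, cv = 10, scoring = 'roc_auc' )
clf.fit( train_X, train_y )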
GridSearchCV(cv=10, error_score='raise-deprecating',
       estimator=KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=None, n_neighbors=5, p=2,
           weights='uniform'),
       fit_params=None, iid='warn', n_jobs=None,
       param_grid=[{'n_neighbors': range(5, 10), 'metric': ['canberra', 'euclidean', 'minkowski']}],
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring='roc_auc', verbose=0)
clf.best_score_
0.8368537419503068
clf.best_params_
{'metric': 'canberra', 'n_neighbors': 5}
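Similarly, the random forest grid search whose output follows can be set up as below; a sketch, with the parameter grid taken from the output and everything else assumed:

## Sketch (assumed): grid search over random forest size, depth and feature sampling
from sklearn.ensemble import RandomForestClassifier

rf_params = [ { 'n_estimators': [10, 20],
                'max_depth': [10, 15],
                'max_features': ['sqrt', 'auto'] } ]
clf = GridSearchCV( RandomForestClassifier(), rf_params, cv = 5, scoring = 'roc_auc' )
clf.fit( train_X, train_y )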
GridSearchCV(cv=5, error_score='raise-deprecating',
       estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
           max_depth=None, max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators='warn', n_jobs=None,
           oob_score=False, random_state=None, verbose=0,
           warm_start=False),
       fit_params=None, iid='warn', n_jobs=None,
       param_grid=[{'n_estimators': [10, 20], 'max_depth': [10, 15],
                    'max_features': ['sqrt', 'auto']}],
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring='roc_auc', verbose=0)
clf.best_score_
0.9399595384858543
clf.best_params_
{'max_depth': 15, 'max_features': 'auto', 'n_estimators': 20}
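feature_rank, used below, is not defined in this extract; a minimal sketch that builds it from the feature importances of the best estimator found by the grid search (the construction is an assumption):

## Sketch (assumed): feature importances of the tuned random forest
radm_clf = clf.best_estimator_
feature_rank = pd.DataFrame( { 'feature': train_X.columns,
                               'importance': radm_clf.feature_importances_ } )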
## Sorting the features based on their importances, with the most important feature at the top.
feature_rank = feature_rank.sort_values('importance', ascending = False)
plt.figure(figsize=(8, 6))
# plot the values
sn.barplot( y = 'feature', x = 'importance', data = feature_rank );
feature_rank['cumsum'] = feature_rank.importance.cumsum() * 100
feature_rank.head(10)
6.5.6 Boosting
6.5.6.1 Adaboost
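The AdaBoost code itself is missing from this extract; a minimal sketch of how an AdaBoost classifier could be fitted on the same data (the base estimator and parameters are illustrative, not the author's), before the gradient boosting example that follows:

## Sketch (assumed): AdaBoost with a shallow decision tree as the base estimator
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier

ada_clf = AdaBoostClassifier( DecisionTreeClassifier( max_depth = 1 ),
                              n_estimators = 50 )
ada_clf.fit( train_X, train_y )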
## Initializing Gradient Boosting with 500 estimators and a max depth of 10.
from sklearn.ensemble import GradientBoostingClassifier
gboost_clf = GradientBoostingClassifier( n_estimators=500, max_depth=10 )
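The cv_scores printed next are not computed in this extract; a plausible sketch using 10-fold cross-validation with accuracy scoring (the fold count and metric are inferred from the output below):

## Sketch (assumed): 10-fold cross-validated accuracy of the gradient boosting model
import numpy as np
from sklearn.model_selection import cross_val_score

cv_scores = cross_val_score( gboost_clf, train_X, train_y, cv = 10, scoring = 'accuracy' )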
print( cv_scores )
print( "Mean Accuracy: ", np.mean(cv_scores), " with standard deviation of: ", np.std(cv_scores) )
[0.98241686 0.98105851 0.98084469 0.9585199  0.95482216 0.96667006
 0.95342452 0.97368689 0.95937357 0.98174607]
Mean Accuracy:  0.969256322542174  with standard deviation of:  0.011406249012935668
gboost_clf.fit(train_X, train_y )
pred_y = gboost_clf.predict( test_X )
draw_cm( test_y, pred_y )
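As with the random forest, feature_rank needs to be rebuilt from the gradient boosting model before the plot below; a minimal sketch (construction assumed):

## Sketch (assumed): feature importances of the fitted gradient boosting model
feature_rank = pd.DataFrame( { 'feature': train_X.columns,
                               'importance': gboost_clf.feature_importances_ } )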
## Sorting the features based on their importances, with the most important feature at the top.
feature_rank = feature_rank.sort_values('importance', ascending = False)
plt.figure(figsize=(8, 6))
# plot the values
sn.barplot( y = 'feature', x = 'importance', data = feature_rank );