# import data
import pandas as pd

wine = pd.read_csv(
    'http://archive.ics.uci.edu/ml/machine-learning-databases/wine-quality/winequality-white.csv',
    delimiter=";")
EDA
column names
wine.columns
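The output of this cell is missing from the export; for reference, the UCI white wine dataset has 12 columns: 11 physicochemical features (fixed acidity, volatile acidity, citric acid, residual sugar, chlorides, free sulfur dioxide, total sulfur dioxide, density, pH, sulphates, alcohol) plus the quality score.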
# distribution of quality scores
wine["quality"].value_counts()

6    2198
5    1457
7     880
8     175
4     163
3      20
9       5
Name: quality, dtype: int64
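The quality scores are heavily imbalanced: most wines score 5-7, while the extreme scores (3, 4, 8, 9) are rare. This motivates the resampling experiments later in the notebook.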
Data Transformation
We want to transform the numeric quality score into a categorical low/medium/high quality level:
Set up model matrix
quality = wine["quality"].values
category = []
for num in quality:
    if num < 5:
        category.append("Low")
    elif num > 6:
        category.append("High")
    else:
        category.append("Medium")

category = pd.DataFrame(data=category, columns=["category"])
data = pd.concat([wine, category], axis=1)
data.drop(columns="quality", inplace=True)

X = data.iloc[:, :-1].values
y = data.iloc[:, -1].values

# encode the category labels as integers (alphabetical: High=0, Low=1, Medium=2)
from sklearn.preprocessing import LabelEncoder
labelencoder_y = LabelEncoder()
y = labelencoder_y.fit_transform(y)
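Random Forest
The train/test split and the random-forest grid-search cells are missing from this export. A minimal reconstruction follows, assuming an 80/20 stratified split (only the variable names X_train/X_test/y_train/y_test are certain); the GridSearchCV settings are read off the object printed below:

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier

# assumed split parameters
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=2018)

# settings taken from the printed GridSearchCV object below
rfc = RandomForestClassifier(oob_score=True, random_state=2018)
rfc_gs = GridSearchCV(rfc,
                      param_grid={'n_estimators': [50, 100, 150, 200, 250],
                                  'min_samples_leaf': [1, 2, 4]},
                      scoring='accuracy', cv=5)
rfc_gs.fit(X_train, y_train)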
GridSearchCV(cv=5, error_score='raise-deprecating',
       estimator=RandomForestClassifier(bootstrap=True, class_weight=None,
            criterion='gini', max_depth=None, max_features='auto',
            max_leaf_nodes=None, min_impurity_decrease=0.0,
            min_impurity_split=None, min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators='warn', n_jobs=None,
            oob_score=True, random_state=2018, verbose=0, warm_start=False),
       fit_params=None, iid='warn', n_jobs=None,
       param_grid={'n_estimators': [50, 100, 150, 200, 250],
                   'min_samples_leaf': [1, 2, 4]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring='accuracy', verbose=0)
rfc_gs.best_score_
0.8363961204696274
SVM
# StratifiedKFold for cross-validation
from sklearn.model_selection import StratifiedKFold
skf = StratifiedKFold(n_splits=4)
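The cell that builds and fits gs_svm is missing from the export; judging from the repr below and the identical post-resampling cell (SVM_rs) later in the notebook, it presumably looked like this sketch:

from sklearn import svm
from sklearn.pipeline import Pipeline

pipe_svm = Pipeline([('clf', svm.SVC())])
grid_params = dict(clf__C=[0.1, 0.3, 1, 3, 10],
                   clf__gamma=[0.1, 0.3, 1, 3, 10],
                   clf__kernel=['rbf', 'sigmoid'])
gs_svm = GridSearchCV(estimator=pipe_svm,
                      param_grid=grid_params,
                      scoring='accuracy',
                      cv=skf)
gs_svm.fit(X_train, y_train)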
GridSearchCV(cv=StratifiedKFold(n_splits=4, random_state=None, shuffle=False),
       error_score='raise-deprecating',
       estimator=Pipeline(memory=None,
            steps=[('clf', SVC(C=1.0, cache_size=200, class_weight=None,
                 coef0=0.0, decision_function_shape='ovr', degree=3,
                 gamma='auto_deprecated', kernel='rbf', max_iter=-1,
                 probability=False, random_state=None, shrinking=True,
                 tol=0.001, verbose=False))]),
       fit_params=None, iid='warn', n_jobs=None,
       param_grid={'clf__C': [0.1, 0.3, 1, 3, 10],
                   'clf__gamma': [0.1, 0.3, 1, 3, 10],
                   'clf__kernel': ['rbf', 'sigmoid']},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring='accuracy', verbose=0)
print(gs_svm.best_score_)
0.8083205717202654
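The relabeling cell below uses pred_svm, which is not defined in this export; presumably it is the best SVM's prediction on the test set:

# assumed: test-set predictions from the refit best estimator
pred_svm = gs_svm.best_estimator_.predict(X_test)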
# @hidden_cell
# relabel back: 0 means good, 1 for low, 2 for medium, for better visualization
y_test_re = list(y_test)
for i in range(len(y_test_re)):
    if y_test_re[i] == 0:
        y_test_re[i] = "good"
    if y_test_re[i] == 1:
        y_test_re[i] = "low"
    if y_test_re[i] == 2:
        y_test_re[i] = "medium"

pred_svm_re = list(pred_svm)
for i in range(len(pred_svm_re)):
    if pred_svm_re[i] == 0:
        pred_svm_re[i] = "good"
    if pred_svm_re[i] == 1:
        pred_svm_re[i] = "low"
    if pred_svm_re[i] == 2:
        pred_svm_re[i] = "medium"
y_actu = pd.Series(y_test_re, name='Actual')
y_pred = pd.Series(pred_svm_re, name='Predicted')
svm_confusion = pd.crosstab(y_actu, y_pred)
svm_confusion
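This relabel-then-crosstab pattern repeats for every model below. A small helper (hypothetical, not in the original notebook) would remove the duplication:

# hypothetical helper: map encoded labels back to names, then build a confusion table
LABEL_NAMES = {0: "good", 1: "low", 2: "medium"}

def confusion_table(y_true, y_pred):
    actual = pd.Series([LABEL_NAMES[v] for v in y_true], name='Actual')
    predicted = pd.Series([LABEL_NAMES[v] for v in y_pred], name='Predicted')
    return pd.crosstab(actual, predicted)

# usage: confusion_table(y_test, pred_svm) reproduces svm_confusion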
Decision Tree
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.tree import DecisionTreeClassifier

clf = Pipeline([
    ('scl', StandardScaler()),
    ('pca', PCA(random_state=42)),
    ('clf', DecisionTreeClassifier(random_state=42))])

# hyperparameter candidates (values taken from the grid printed below)
class_weight = ['balanced', None]
criterion = ['gini', 'entropy']
splitter = ['best']
max_depth = [8, 9, 10, 11, 15, 20, 25]
min_samples_leaf = [2, 3, 5]

param_grid = \
    [{'clf__class_weight': class_weight,
      'clf__criterion': criterion,
      'clf__splitter': splitter,
      'clf__max_depth': max_depth,
      'clf__min_samples_leaf': min_samples_leaf
      }]
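The construction of gs_dt is missing from the export; judging from the repr below and the identical DT_rs cell later, it was presumably:

gs_dt = GridSearchCV(estimator=clf,
                     param_grid=param_grid,
                     scoring='accuracy',
                     cv=5,
                     verbose=1,
                     n_jobs=-1)
gs_dt.fit(X_train, y_train)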
GridSearchCV(cv=5, error_score='raise-deprecating',
       estimator=Pipeline(memory=None,
            steps=[('scl', StandardScaler(copy=True, with_mean=True,
                 with_std=True)),
               ('pca', PCA(copy=True, iterated_power='auto',
                 n_components=None, random_state=42, svd_solver='auto',
                 tol=0.0, whiten=False)),
               ('clf', DecisionTreeClassifier(class_weight=None,
                 criterion='gini', max_depth=None,
                 ma... min_weight_fraction_leaf=0.0, presort=False,
                 random_state=42, splitter='best'))]),
       fit_params=None, iid='warn', n_jobs=-1,
       param_grid=[{'clf__class_weight': ['balanced', None],
                    'clf__criterion': ['gini', 'entropy'],
                    'clf__splitter': ['best'],
                    'clf__max_depth': [8, 9, 10, 11, 15, 20, 25],
                    'clf__min_samples_leaf': [2, 3, 5]}],
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring='accuracy', verbose=1)
print(gs_dt.best_score_)
0.7595712098009189
KNN
# knn with pca grid search
from sklearn.neighbors import KNeighborsClassifier

pipe_knn_pca = Pipeline([('pca', PCA()),
                         ('clf', KNeighborsClassifier())])
grid_params_knn_pca = dict(pca__n_components=range(1, 11),
                           clf__n_neighbors=range(1, 10))
gs_knn_pca = GridSearchCV(estimator=pipe_knn_pca,
                          param_grid=grid_params_knn_pca,
                          scoring='accuracy')
gs_knn_pca.fit(X_train, y_train)
/Users/tianqiluke/anaconda3/lib/python3.6/site-packages/sklearn/model_selection/_split.py:2053:
FutureWarning: You should specify a value for 'cv' instead of relying on the
default value. The default value will change from 3 to 5 in version 0.22.
  warnings.warn(CV_WARNING, FutureWarning)
GridSearchCV(cv='warn', error_score='raise-deprecating',
       estimator=Pipeline(memory=None,
            steps=[('pca', PCA(copy=True, iterated_power='auto',
                 n_components=None, random_state=None, svd_solver='auto',
                 tol=0.0, whiten=False)),
               ('clf', KNeighborsClassifier(algorithm='auto', leaf_size=30,
                 metric='minkowski', metric_params=None, n_jobs=None,
                 n_neighbors=5, p=2, weights='uniform'))]),
       fit_params=None, iid='warn', n_jobs=None,
       param_grid={'pca__n_components': range(1, 11),
                   'clf__n_neighbors': range(1, 10)},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring='accuracy', verbose=0)
print(gs_knn_pca.best_score_)
0.7391526288922919
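The next cell builds the random forest's confusion matrix from pred_rfc, which is not defined in this export; presumably it comes from the tuned random forest:

# assumed: test-set predictions from the tuned random forest
pred_rfc = rfc_gs.best_estimator_.predict(X_test)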
# @hidden_cell
# relabel back: 0 means good, 1 for low, 2 for medium, for better visualization
y_test_re = list(y_test)
for i in range(len(y_test_re)):
    if y_test_re[i] == 0:
        y_test_re[i] = "good"
    if y_test_re[i] == 1:
        y_test_re[i] = "low"
    if y_test_re[i] == 2:
        y_test_re[i] = "medium"

pred_rfc_re = list(pred_rfc)
for i in range(len(pred_rfc_re)):
    if pred_rfc_re[i] == 0:
        pred_rfc_re[i] = "good"
    if pred_rfc_re[i] == 1:
        pred_rfc_re[i] = "low"
    if pred_rfc_re[i] == 2:
        pred_rfc_re[i] = "medium"

y_actu = pd.Series(y_test_re, name='Actual')
y_pred = pd.Series(pred_rfc_re, name='Predicted')
rfc_confusion = pd.crosstab(y_actu, y_pred)
rfc_confusion
Resampling
# undersample class "2" (medium) down to 1,500;
# oversample classes "1" (low) and "0" (good) up to 1,500
from imblearn.under_sampling import ClusterCentroids
from imblearn.over_sampling import SMOTE

smt = ClusterCentroids(ratio={2: 1500})
X_sm, y_sm = smt.fit_sample(X_train, y_train)
smt2 = SMOTE(ratio={0: 1500, 1: 1500})
X_sm2, y_sm2 = smt2.fit_sample(X_sm, y_sm)
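A quick sanity check (not in the original notebook) that the classes are now balanced:

from collections import Counter
print(Counter(y_sm2))   # expected: 1,500 samples per class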
Random Forest
rfc_rs = RandomForestClassifier(random_state=2018)
param_dist = {"n_estimators": [50, 100, 150, 200, 250],
              'min_samples_leaf': [1, 2, 4]}
rfc_gs_rs = GridSearchCV(rfc_rs, param_grid=param_dist,
                         scoring='accuracy', cv=5)
rfc_gs_rs.fit(X_sm2, y_sm2)
GridSearchCV(cv=5, error_score='raise-deprecating',
       estimator=RandomForestClassifier(bootstrap=True, class_weight=None,
            criterion='gini', max_depth=None, max_features='auto',
            max_leaf_nodes=None, min_impurity_decrease=0.0,
            min_impurity_split=None, min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators='warn', n_jobs=None,
            oob_score=False, random_state=2018, verbose=0, warm_start=False),
       fit_params=None, iid='warn', n_jobs=None,
       param_grid={'n_estimators': [50, 100, 150, 200, 250],
                   'min_samples_leaf': [1, 2, 4]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring='accuracy', verbose=0)
rfc_gs_rs.best_score_
0.862
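Note that this 0.862 is cross-validated accuracy on the resampled training data, so it is not directly comparable to the 0.836 obtained before resampling; the fair comparison is on the untouched test set below.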
importances = rfc_gs_rs.best_estimator_.feature_importances_
feature_importances = pd.DataFrame(
    importances, index=wine.columns[:-1],
    columns=['importance']).sort_values('importance', ascending=False)
feature_importances.plot(kind='barh')
SVM_rs
# grid search after resample
pipe_svm = Pipeline([('clf', svm.SVC())])
grid_params = dict(clf__C=[0.1, 0.3, 1, 3, 10],
                   clf__gamma=[0.1, 0.3, 1, 3, 10],
                   clf__kernel=['rbf', 'sigmoid'])
gs_svm_sm = GridSearchCV(estimator=pipe_svm,
                         param_grid=grid_params,
                         scoring='accuracy',
                         cv=skf)
gs_svm_sm.fit(X_sm2, y_sm2)
GridSearchCV(cv=StratifiedKFold(n_splits=4, random_state=None, shuffle=False),
       error_score='raise-deprecating',
       estimator=Pipeline(memory=None,
            steps=[('clf', SVC(C=1.0, cache_size=200, class_weight=None,
                 coef0=0.0, decision_function_shape='ovr', degree=3,
                 gamma='auto_deprecated', kernel='rbf', max_iter=-1,
                 probability=False, random_state=None, shrinking=True,
                 tol=0.001, verbose=False))]),
       fit_params=None, iid='warn', n_jobs=None,
       param_grid={'clf__C': [0.1, 0.3, 1, 3, 10],
                   'clf__gamma': [0.1, 0.3, 1, 3, 10],
                   'clf__kernel': ['rbf', 'sigmoid']},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring='accuracy', verbose=0)
gs_svm_sm.best_score_
0.822
DT_rs
clf = Pipeline([
    ('scl', StandardScaler()),
    ('pca', PCA(random_state=42)),
    ('clf', DecisionTreeClassifier(random_state=42))])
param_grid = \
    [{'clf__class_weight': class_weight,
      'clf__criterion': criterion,
      'clf__splitter': splitter,
      'clf__max_depth': max_depth,
      'clf__min_samples_leaf': min_samples_leaf
      }]
gs_dt_rs = GridSearchCV(estimator=clf,
                        param_grid=param_grid,
                        scoring='accuracy',
                        cv=5,
                        verbose=1,
                        n_jobs=-1)
gs_dt_rs.fit(X_sm2, y_sm2)
GridSearchCV(cv=5, error_score='raise-deprecating',
       estimator=Pipeline(memory=None,
            steps=[('scl', StandardScaler(copy=True, with_mean=True,
                 with_std=True)),
               ('pca', PCA(copy=True, iterated_power='auto',
                 n_components=None, random_state=42, svd_solver='auto',
                 tol=0.0, whiten=False)),
               ('clf', DecisionTreeClassifier(class_weight=None,
                 criterion='gini', max_depth=None,
                 ma... min_weight_fraction_leaf=0.0, presort=False,
                 random_state=42, splitter='best'))]),
       fit_params=None, iid='warn', n_jobs=-1,
       param_grid=[{'clf__class_weight': ['balanced', None],
                    'clf__criterion': ['gini', 'entropy'],
                    'clf__splitter': ['best'],
                    'clf__max_depth': [8, 9, 10, 11, 15, 20, 25],
                    'clf__min_samples_leaf': [2, 3, 5]}],
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring='accuracy', verbose=1)
gs_dt_rs.best_score_
0.7455555555555555
KNN_rs
# knn with pca grid search
pipe_knn_pca = Pipeline([('pca', PCA()),
                         ('clf', KNeighborsClassifier())])
grid_params_knn_pca = dict(pca__n_components=range(1, 11),
                           clf__n_neighbors=range(1, 10))
gs_knn_pca_rs = GridSearchCV(estimator=pipe_knn_pca,
                             param_grid=grid_params_knn_pca,
                             scoring='accuracy')
gs_knn_pca_rs.fit(X_sm2, y_sm2)
/Users/tianqiluke/anaconda3/lib/python3.6/site-packages/sklearn/model_selection/_split.py:2053:
FutureWarning: You should specify a value for 'cv' instead of relying on the
default value. The default value will change from 3 to 5 in version 0.22.
  warnings.warn(CV_WARNING, FutureWarning)
GridSearchCV(cv='warn', error_score='raise-deprecating',
       estimator=Pipeline(memory=None,
            steps=[('pca', PCA(copy=True, iterated_power='auto',
                 n_components=None, random_state=None, svd_solver='auto',
                 tol=0.0, whiten=False)),
               ('clf', KNeighborsClassifier(algorithm='auto', leaf_size=30,
                 metric='minkowski', metric_params=None, n_jobs=None,
                 n_neighbors=5, p=2, weights='uniform'))]),
       fit_params=None, iid='warn', n_jobs=None,
       param_grid={'pca__n_components': range(1, 11),
                   'clf__n_neighbors': range(1, 10)},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring='accuracy', verbose=0)
gs_knn_pca_rs.best_score_
0.7715555555555556
" RF gives best result, but SVM also good as well, check both model under resample. "
" check their performance on test data "
confusion matrix
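The cells below use svm_rs_predicted and pred_rfc_rs, which are not defined in this export; presumably they are the resampled models' predictions on the original test set:

# assumed: predictions of the resampled models on the untouched test set
svm_rs_predicted = gs_svm_sm.best_estimator_.predict(X_test)
pred_rfc_rs = rfc_gs_rs.best_estimator_.predict(X_test)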
y_test_re = list(y_test)
for i in range(len(y_test_re)):
    if y_test_re[i] == 0:
        y_test_re[i] = "good"
    if y_test_re[i] == 1:
        y_test_re[i] = "low"
    if y_test_re[i] == 2:
        y_test_re[i] = "medium"

sm_pred_re = list(svm_rs_predicted)
for i in range(len(sm_pred_re)):
    if sm_pred_re[i] == 0:
        sm_pred_re[i] = "good"
    if sm_pred_re[i] == 1:
        sm_pred_re[i] = "low"
    if sm_pred_re[i] == 2:
        sm_pred_re[i] = "medium"

svm_rs_confusion = pd.crosstab(pd.Series(y_test_re, name='actual'),
                               pd.Series(sm_pred_re, name='predicted'))
svm_rs_confusion
from sklearn.metrics import classification_report, accuracy_score

print(classification_report(y_test, pred_rfc_rs))
print("The RF model (resampled) accuracy on test is %s" %
      accuracy_score(y_test, pred_rfc_rs))
y_test_re = list(y_test)
for i in range(len(y_test_re)):
    if y_test_re[i] == 0:
        y_test_re[i] = "good"
    if y_test_re[i] == 1:
        y_test_re[i] = "low"
    if y_test_re[i] == 2:
        y_test_re[i] = "medium"

pred_rfc_re = list(pred_rfc_rs)
for i in range(len(pred_rfc_re)):
    if pred_rfc_re[i] == 0:
        pred_rfc_re[i] = "good"
    if pred_rfc_re[i] == 1:
        pred_rfc_re[i] = "low"
    if pred_rfc_re[i] == 2:
        pred_rfc_re[i] = "medium"

y_actu = pd.Series(y_test_re, name='Actual')
y_pred = pd.Series(pred_rfc_re, name='Predicted')
rfc_rs_confusion = pd.crosstab(y_actu, y_pred)
confusion matrix
rfc_rs_confusion
SVM (resampled) has higher overall accuracy on the test set than random forest (78% vs. 73%), but RF (resampled) performs better on the good and low classes. Which model to deploy in the future is a trade-off that depends on the business purpose.