0% found this document useful (0 votes)
21 views2 pages

Compare Data Preparation Methods For The Wine Classification Dataset

This Python code compares different data preparation methods for a wine classification machine learning model. It loads a wine dataset, defines a logistic regression model, and prepares six modeling pipelines using different normalization techniques like MinMaxScaler, StandardScaler, QuantileTransformer, and dimensionality reduction like PCA and SVD. It evaluates each pipeline using cross-validation and calculates the mean accuracy and standard deviation. The results are printed and plotted in a boxplot to compare the performance of the different data preparation methods.

Uploaded by

Nguyen Xuan Vi
Copyright
© All Rights Reserved
We take content rights seriously. If you suspect this is your content, claim it here.
Available Formats
Download as PDF, TXT or read online on Scribd
0% found this document useful (0 votes)
21 views2 pages

Compare Data Preparation Methods For The Wine Classification Dataset

This Python code compares different data preparation methods for a wine classification machine learning model. It loads a wine dataset, defines a logistic regression model, and prepares six modeling pipelines using different normalization techniques like MinMaxScaler, StandardScaler, QuantileTransformer, and dimensionality reduction like PCA and SVD. It evaluates each pipeline using cross-validation and calculates the mean accuracy and standard deviation. The results are printed and plotted in a boxplot to compare the performance of the different data preparation methods.

Uploaded by

Nguyen Xuan Vi
Copyright
© All Rights Reserved
We take content rights seriously. If you suspect this is your content, claim it here.
Available Formats
Download as PDF, TXT or read online on Scribd
You are on page 1/ 2

# compare data preparation methods for the wine classification dataset

from numpy import mean


from numpy import std
from pandas import read_csv
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import QuantileTransformer
from sklearn.preprocessing import KBinsDiscretizer
from sklearn.decomposition import PCA
from sklearn.decomposition import TruncatedSVD
from matplotlib import pyplot

# prepare the dataset


def load_dataset():
    """Load the wine classification dataset and return it ready for modeling.

    Returns:
        X: feature matrix as a float ndarray of shape (n_samples, n_features).
        y: integer-encoded class labels (one label per sample).
    """
    # NOTE: the scraped original split `url =` from its value (a syntax error)
    # and prefixed the address with a Scribd link proxy; this is the canonical
    # location of the dataset.
    url = 'https://raw.githubusercontent.com/jbrownlee/Datasets/master/wine.csv'
    df = read_csv(url, header=None)
    data = df.values
    # the last column holds the class label; everything before it is a feature
    X, y = data[:, :-1], data[:, -1]
    # minimally prepare dataset: numeric features, label-encoded target
    X = X.astype('float')
    y = LabelEncoder().fit_transform(y.astype('str'))
    return X, y

# evaluate a model
def evaluate_model(X, y, model):
    """Score *model* on (X, y) with repeated stratified 10-fold cross-validation.

    Uses 3 repeats and a fixed random_state so runs are reproducible.

    Returns:
        ndarray of per-fold accuracy scores (30 values: 10 folds x 3 repeats).
    """
    folds = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
    return cross_val_score(model, X, y, scoring='accuracy', cv=folds, n_jobs=-1)

# get modeling pipelines to evaluate


def get_pipelines(model):
    """Build the (name, pipeline) pairs to compare for *model*.

    Each pipeline applies one data-preparation step ('s') followed by the
    shared model ('m'): min-max normalization, standardization, quantile
    transform, k-bins discretization, PCA, and truncated SVD.
    """
    # one (label, transform) entry per preparation method under comparison
    preparations = [
        ('norm', MinMaxScaler()),
        ('std', StandardScaler()),
        ('quan', QuantileTransformer(n_quantiles=100,
                                     output_distribution='normal')),
        ('kbins', KBinsDiscretizer(n_bins=10, encode='ordinal',
                                   strategy='uniform')),
        ('pca', PCA(n_components=7)),
        ('svd', TruncatedSVD(n_components=7)),
    ]
    return [(label, Pipeline([('s', transform), ('m', model)]))
            for label, transform in preparations]

# get the dataset


X, y = load_dataset()
# single baseline classifier shared by every preparation pipeline
model = LogisticRegression(solver='liblinear')
pipelines = get_pipelines(model)
# evaluate each pipeline, printing mean accuracy and standard deviation
results, names = [], []
for name, pipeline in pipelines:
    scores = evaluate_model(X, y, pipeline)
    print('>%s: %.3f (%.3f)' % (name, mean(scores), std(scores)))
    results.append(scores)
    names.append(name)
# compare the score distributions side by side
pyplot.boxplot(results, labels=names, showmeans=True)
pyplot.show()

You might also like