Code Diamond

The document outlines a machine learning workflow for predicting diamond prices using a dataset. It includes data preprocessing, model training with a Multi-layer Perceptron (MLP) regressor, and evaluation of performance metrics such as MAPE, MSE, and RMSE. Additionally, it explores optimization techniques using Genetic Algorithms (GA) and Particle Swarm Optimization (PSO) to enhance model performance, with results indicating significant discrepancies in error metrics across different methods.


import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV  # GridSearchCV/RandomizedSearchCV are imported but not used below
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.neural_network import MLPRegressor
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.metrics import mean_squared_error, mean_absolute_percentage_error
from sklearn.cluster import KMeans
from deap import base, creator, tools, algorithms
import random, warnings
warnings.filterwarnings("ignore")
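
Note: besides scikit-learn, the script assumes the deap and pyswarm packages are installed, and presumably openpyxl so that pandas can read the .xlsx file.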

# === Load data ===
# Note: 'daimonds.xlsx' is the filename used in the source; 'y' below is a
# dimension column of the diamonds dataset, not the regression target.
df = pd.read_excel("daimonds.xlsx")
categorical_cols = ['color', 'clarity']
numerical_cols = ['carat', 'y']
target_col = 'price'

# === Preprocessing ===
X = df[numerical_cols + categorical_cols]
y = df[[target_col]]
column_trans = ColumnTransformer([
    ('cat', OneHotEncoder(sparse_output=False), categorical_cols)
], remainder='passthrough')
X_encoded = column_trans.fit_transform(X)
X_train, X_val, y_train, y_val = train_test_split(X_encoded, y,
                                                  test_size=0.2, random_state=42)

# === Baseline: manual fuzzification + MLP ===
scaler_X = MinMaxScaler()
scaler_y = MinMaxScaler()
X_train_scaled = scaler_X.fit_transform(X_train)
X_val_scaled = scaler_X.transform(X_val)
y_train_scaled = scaler_y.fit_transform(y_train)
y_val_scaled = scaler_y.transform(y_val)

def generate_gaussian_mf_features(X, n_mf=2):
    # Note: this re-fits KMeans on whatever array it receives, so the
    # validation set gets its own centers; the GaussianMFTransformer class
    # further below fixes this by fitting centers once on the training data.
    mf_features = []
    for i in range(X.shape[1]):
        col = X[:, i].reshape(-1, 1)
        kmeans = KMeans(n_clusters=n_mf, random_state=42).fit(col)
        centers = np.sort(kmeans.cluster_centers_.flatten())
        sigma = 0.1
        features = [np.exp(-0.5 * ((col - c) / sigma) ** 2) for c in centers]
        mf_features.extend(features)
    return np.hstack(mf_features)

X_train_fuzzy = generate_gaussian_mf_features(X_train_scaled)
X_val_fuzzy = generate_gaussian_mf_features(X_val_scaled)
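
Each scaled feature value x is mapped to a membership degree mu(x) = exp(-((x - c)^2) / (2 * sigma^2)) around each KMeans center c. A quick illustrative check (the values here are chosen for the example, not taken from the data):

# Membership of x = 0.5 in a fuzzy set centered at c = 0.3 with sigma = 0.1:
x, c, sigma = 0.5, 0.3, 0.1
mu = np.exp(-0.5 * ((x - c) / sigma) ** 2)
print(round(mu, 4))  # exp(-2) ≈ 0.1353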

model = MLPRegressor(hidden_layer_sizes=(16,), activation='relu',
                     max_iter=500, random_state=42)
model.fit(X_train_fuzzy, y_train_scaled.ravel())

y_pred_train_scaled = model.predict(X_train_fuzzy).reshape(-1, 1)
y_pred_val_scaled = model.predict(X_val_fuzzy).reshape(-1, 1)
y_pred_train = scaler_y.inverse_transform(y_pred_train_scaled)
y_pred_val = scaler_y.inverse_transform(y_pred_val_scaled)
y_train_orig = scaler_y.inverse_transform(y_train_scaled)
y_val_orig = scaler_y.inverse_transform(y_val_scaled)

results = []

def evaluate_metrics(y_true, y_pred, name=""):
    mse = mean_squared_error(y_true, y_pred)
    rmse = np.sqrt(mse)
    mape = mean_absolute_percentage_error(y_true, y_pred)
    nrmse = rmse / (np.max(y_true) - np.min(y_true))  # RMSE normalized by the target range
    print(f"\n📊 {name} evaluation results:")
    print(f" - MAPE : {mape:.4f}")
    print(f" - MSE : {mse:.2f}")
    print(f" - RMSE : {rmse:.2f}")
    print(f" - NRMSE: {nrmse:.4f}")
    return {"Method": name, "MAPE": mape, "MSE": mse, "RMSE": rmse,
            "NRMSE": nrmse}

results.append(evaluate_metrics(y_train_orig, y_pred_train, "Baseline - Train"))
results.append(evaluate_metrics(y_val_orig, y_pred_val, "Baseline - Validation"))

📊 Baseline - Train evaluation results:
 - MAPE : 0.3629
 - MSE : 984494.44
 - RMSE : 992.22
 - NRMSE: 0.0404

📊 Baseline - Validation evaluation results:
 - MAPE : 0.3614
 - MSE : 983552.14
 - RMSE : 991.74
 - NRMSE: 0.0530

class GaussianMFTransformer(BaseEstimator, TransformerMixin):
    def __init__(self, n_mf=2, sigma=0.1):
        self.n_mf = n_mf
        self.sigma = sigma
        self.centers_list = []

    def fit(self, X, y=None):
        # Fit KMeans centers per column once, so transform() reuses the same
        # centers for training and validation data.
        self.centers_list = []
        for i in range(X.shape[1]):
            col = X[:, i].reshape(-1, 1)
            unique_vals = np.unique(col)
            n_clusters = min(self.n_mf, len(unique_vals))
            kmeans = KMeans(n_clusters=n_clusters, random_state=42).fit(col)
            centers = np.sort(kmeans.cluster_centers_.flatten())
            self.centers_list.append(centers)
        return self

    def transform(self, X):
        mf_features = []
        for i in range(X.shape[1]):
            col = X[:, i].reshape(-1, 1)
            centers = self.centers_list[i]
            features = [np.exp(-0.5 * ((col - c) / self.sigma) ** 2)
                        for c in centers]
            mf_features.extend(features)
        return np.hstack(mf_features)
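
A quick shape check (a minimal sketch, not part of the original script): with n_mf=2, each input column should expand into up to two Gaussian membership columns.

fuzzy = GaussianMFTransformer(n_mf=2, sigma=0.1)
demo = fuzzy.fit_transform(X_train_scaled)
print(demo.shape)  # (n_samples, up to 2 * n_features); fewer if a column has < 2 unique values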

# === GA with baseline init ===
def evalGA(ind):
    n_mf, sigma, hl = int(ind[0]), ind[1], int(ind[2])
    model = Pipeline([
        ("scaler_X", MinMaxScaler()),
        ("fuzzy", GaussianMFTransformer(n_mf=n_mf, sigma=sigma)),
        ("mlp", MLPRegressor(hidden_layer_sizes=(hl,), max_iter=500,
                             random_state=42))
    ])
    # Caution: the pipeline is trained on the raw (unscaled) prices, yet the
    # predictions are pushed through scaler_y.inverse_transform, which was
    # fitted on scaled targets. This mismatch inflates the GA/PSO error
    # metrics reported further down.
    model.fit(X_train, y_train.values.ravel())
    pred = scaler_y.inverse_transform(model.predict(X_val).reshape(-1, 1))
    return (mean_absolute_percentage_error(y_val, pred),)

creator.create("FitnessMin", base.Fitness, weights=(-1.0,))


creator.create("Individual", list, fitness=creator.FitnessMin)
toolbox = base.Toolbox()
toolbox.register("attr_int", random.randint, 2, 4)
toolbox.register("attr_sigma", random.uniform, 0.05, 0.2)
toolbox.register("attr_hl", random.randint, 8, 32)
toolbox.register("individual", tools.initCycle, creator.Individual,
(toolbox.attr_int, toolbox.attr_sigma,
toolbox.attr_hl), n=1)
toolbox.register("population", tools.initRepeat, list,
toolbox.individual)
toolbox.register("evaluate", evalGA)
toolbox.register("mate", tools.cxBlend, alpha=0.5)
toolbox.register("mutate", tools.mutGaussian, mu=0, sigma=1,
indpb=0.2)
toolbox.register("select", tools.selTournament, tournsize=3)
init_ind = creator.Individual([2, 0.1, 16])
pop = [init_ind] + toolbox.population(n=9)
pop, _ = algorithms.eaSimple(pop, toolbox, cxpb=0.5, mutpb=0.2,
ngen=5, verbose=False)
best_ind = tools.selBest(pop, k=1)[0]
ga_model = Pipeline([
("scaler_X", MinMaxScaler()),
("fuzzy", GaussianMFTransformer(n_mf=int(best_ind[0]),
sigma=best_ind[1])),
("mlp", MLPRegressor(hidden_layer_sizes=(int(best_ind[2]),),
max_iter=500, random_state=42))
])
ga_model.fit(X_train, y_train.values.ravel())
y_pred_train =
scaler_y.inverse_transform(ga_model.predict(X_train).reshape(-1, 1))
y_pred_val =
scaler_y.inverse_transform(ga_model.predict(X_val).reshape(-1, 1))
results.append(evaluate_metrics(y_train, y_pred_train, "GA - Train"))
results.append(evaluate_metrics(y_val, y_pred_val, "GA - Validation"))

📊 GA - Train evaluation results:
 - MAPE : 25112.7541
 - MSE : 18622354043964312.00
 - RMSE : 136463746.26
 - NRMSE: 5561.8192

📊 GA - Validation evaluation results:
 - MAPE : 25141.8347
 - MSE : 18524009626157444.00
 - RMSE : 136102937.61
 - NRMSE: 7275.1719
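
The GA errors above are several orders of magnitude worse than the baseline, which is consistent with the scaler mismatch flagged in evalGA: the pipeline never scales the target, yet its raw-scale predictions are passed through scaler_y.inverse_transform. A minimal corrected sketch (an assumption, not the original author's fix) would let scikit-learn's TransformedTargetRegressor handle the target scaling and its inversion:

from sklearn.compose import TransformedTargetRegressor

def evalGA_scaled(ind):
    n_mf, sigma, hl = int(ind[0]), ind[1], int(ind[2])
    pipe = Pipeline([
        ("scaler_X", MinMaxScaler()),
        ("fuzzy", GaussianMFTransformer(n_mf=n_mf, sigma=sigma)),
        ("mlp", MLPRegressor(hidden_layer_sizes=(hl,), max_iter=500,
                             random_state=42))
    ])
    # TransformedTargetRegressor scales y before fitting and inverts the
    # scaling automatically at predict time, avoiding the mismatch.
    model = TransformedTargetRegressor(regressor=pipe, transformer=MinMaxScaler())
    model.fit(X_train, y_train.values.ravel())
    pred = model.predict(X_val)  # already on the original price scale
    return (mean_absolute_percentage_error(y_val, pred),)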

# === PSO with baseline init ===
def pso_objective(x):
    n_mf, sigma, hl = int(x[0]), x[1], int(x[2])
    model = Pipeline([
        ("scaler_X", MinMaxScaler()),
        ("fuzzy", GaussianMFTransformer(n_mf=n_mf, sigma=sigma)),
        ("mlp", MLPRegressor(hidden_layer_sizes=(hl,), max_iter=500,
                             random_state=42))
    ])
    # Same unscaled-target / inverse_transform mismatch as in evalGA above.
    model.fit(X_train, y_train.values.ravel())
    pred = scaler_y.inverse_transform(model.predict(X_val).reshape(-1, 1))
    return mean_absolute_percentage_error(y_val, pred)

from pyswarm import pso as pso_func

lb = [2, 0.05, 8]   # lower bounds: n_mf, sigma, hidden layer size
ub = [4, 0.2, 32]   # upper bounds

# Intended to seed the swarm with the baseline configuration, but pyswarm's
# pso() exposes no initial-position argument, so x0 below is never used and
# the swarm is initialized randomly.
def pso_with_baseline(obj, lb, ub, **kwargs):
    kwargs.setdefault('swarmsize', 10)
    kwargs.setdefault('maxiter', 5)
    x0 = np.array([2, 0.1, 16])  # defined but unused (see note above)
    xopt, fopt = pso_func(obj, lb, ub, **kwargs)
    return xopt, fopt
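
If genuinely seeding the swarm with the baseline is desired, one option (an assumption: it requires the separate pyswarms package, which does expose an init_pos argument) is:

import pyswarms as ps

swarm_size = 10
init_pos = np.random.uniform(lb, ub, size=(swarm_size, 3))
init_pos[0] = [2, 0.1, 16]  # place one particle at the baseline configuration
optimizer = ps.single.GlobalBestPSO(
    n_particles=swarm_size, dimensions=3,
    options={'c1': 0.5, 'c2': 0.3, 'w': 0.9},
    bounds=(np.array(lb, dtype=float), np.array(ub, dtype=float)),
    init_pos=init_pos)
# pyswarms objectives receive the whole swarm at once (shape: [n_particles, 3])
cost, pos = optimizer.optimize(
    lambda xs: np.array([pso_objective(x) for x in xs]), iters=5)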

best_params, _ = pso_with_baseline(pso_objective, lb, ub)

pso_model = Pipeline([
    ("scaler_X", MinMaxScaler()),
    ("fuzzy", GaussianMFTransformer(n_mf=int(best_params[0]), sigma=best_params[1])),
    ("mlp", MLPRegressor(hidden_layer_sizes=(int(best_params[2]),),
                         max_iter=500, random_state=42))
])
pso_model.fit(X_train, y_train.values.ravel())
y_pred_train = scaler_y.inverse_transform(pso_model.predict(X_train).reshape(-1, 1))
y_pred_val = scaler_y.inverse_transform(pso_model.predict(X_val).reshape(-1, 1))
results.append(evaluate_metrics(y_train, y_pred_train, "PSO - Train"))
results.append(evaluate_metrics(y_val, y_pred_val, "PSO - Validation"))

Stopping search: maximum iterations reached --> 5

📊 PSO - Train evaluation results:
 - MAPE : 25058.9360
 - MSE : 18483619158049740.00
 - RMSE : 135954474.58
 - NRMSE: 5541.0629

📊 PSO - Validation evaluation results:
 - MAPE : 25056.6385
 - MSE : 18412087978249912.00
 - RMSE : 135691149.23
 - NRMSE: 7253.1604

# === Summary of all methods ===
print("\n📊 Comparison of all methods:")
print(pd.DataFrame(results))

📊 Comparison of all methods:
                   Method          MAPE           MSE          RMSE        NRMSE
0        Baseline - Train      0.362919  9.844944e+05  9.922169e+02     0.040440
1   Baseline - Validation      0.361365  9.835521e+05  9.917420e+02     0.053012
2              GA - Train  25112.754142  1.862235e+16  1.364637e+08  5561.819207
3         GA - Validation  25141.834680  1.852401e+16  1.361029e+08  7275.171899
4             PSO - Train  25058.935955  1.848362e+16  1.359545e+08  5541.062947
5        PSO - Validation  25056.638548  1.841209e+16  1.356911e+08  7253.160388
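
Across the table, the baseline MAPE (~0.36) is dwarfed by the GA/PSO MAPEs (~25,000), and MSE jumps from ~1e6 to ~1.8e16. Rather than reflecting the optimizers themselves, this gap matches the inverse_transform mismatch noted in evalGA and pso_objective: predictions made on the raw price scale are stretched again by a scaler that was fitted on [0, 1] targets.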
