Laboratoare SBC

The document discusses various machine learning algorithms including KNN, SVM, Naive Bayes, decision trees, random forests, XGBoost and clustering algorithms like KMeans and DBSCAN. It provides code examples for implementing these algorithms on the iris dataset and evaluating their performance.

Curs 05.03
KNN
from sklearn.datasets import load_iris
import numpy as np
from sklearn.model_selection import train_test_split
flori = load_iris()
X = flori.data
y = flori.target  # the labels
# split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2,
                                                    random_state = 42)
len(X_train)  # number of training samples
from sklearn.neighbors import KNeighborsClassifier
from sklearn import metrics
# tuning for k

k_range = range(1, 26)  # 25 values, from 1 to 25

scoruri = {}  # dictionary of (y_test, y_pred) tuples keyed by k
scoruri_lista = []

for k in k_range:
    knn = KNeighborsClassifier(n_neighbors = k)  # KNeighborsClassifier is a constructor
    knn.fit(X_train, y_train)
    y_pred = knn.predict(X_test)
    scoruri[k] = (y_test, y_pred)
    scoruri_lista.append(metrics.accuracy_score(y_test, y_pred))  # append adds the value at the end of the list
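
Plotting the collected accuracies makes it easier to see which k works best; a minimal sketch, assuming matplotlib is available:

import matplotlib.pyplot as plt

plt.plot(list(k_range), scoruri_lista, marker = 'o')  # test accuracy for each k
plt.xlabel('k (n_neighbors)')
plt.ylabel('test accuracy')
plt.title('KNN accuracy vs. k')
plt.show()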

# choose k = 5
knn = KNeighborsClassifier(n_neighbors = 5)
knn.fit(X, y)

# two new flowers
X_nou = [[3, 4, 5, 2], [5, 4, 2, 2]]
y_predict = knn.predict(X_nou)
y_predict
X_train

Lab 11.03
SVM
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

# seaborn and matplotlib for plots

from sklearn.model_selection import train_test_split


from sklearn.svm import SVC
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.preprocessing import LabelEncoder

# load the data
floricele = pd.read_csv('iris.csv')
print(floricele.head())
print(floricele.info())

#EDA
sns.countplot(x = 'variety', data = floricele)

sns.pairplot(floricele, hue = 'variety')

floricele.describe()
# 50% is the median, the value that splits the dataset into two equal halves
# 25% is the first quartile

X = floricele.drop('variety', axis = 1)
X.head()

y = floricele['variety']
y.tail()

# apply LabelEncoder
label_encoder = LabelEncoder()
type(label_encoder)
y_encoded = label_encoder.fit_transform(y)
y_encoded

X_train, X_test, y_train, y_test = train_test_split(X, y_encoded, test_size = 0.3,
                                                    random_state = 42)
X_train

# linear SVM
model = SVC(kernel = 'linear')
# model is an instance/object of the SVC class

model.fit(X_train, y_train)

predictii = model.predict(X_test)

# the confusion matrix
mat_conf = confusion_matrix(y_test, predictii)
print(mat_conf)

print(classification_report(y_test, predictii))
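
Since seaborn is already imported for plotting, the confusion matrix can also be rendered as a heatmap; a minimal sketch:

sns.heatmap(mat_conf, annot = True, fmt = 'd', cmap = 'Blues')
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.show()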

Lab 25.03 – continuation of lab 11.03


# RBF-kernel SVM model
model_rbf = SVC(kernel = 'rbf', C = 1.0, gamma = 'scale')
model_rbf.fit(X_train, y_train)

# then make the predictions and the confusion matrix as for the previous model
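
A minimal sketch of that step, reusing the helpers already imported above:

predictii_rbf = model_rbf.predict(X_test)
print(confusion_matrix(y_test, predictii_rbf))
print(classification_report(y_test, predictii_rbf))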


~~~~ here the continuation of lab 11.03 ends and lab 25.03 begins

import pandas as pd
flori = pd.read_csv('iris.csv')
flori.head()

# encode the label column ('variety')


from sklearn.preprocessing import LabelEncoder

label_encoder = LabelEncoder()
flori['variety'] = label_encoder.fit_transform(flori['variety'])
flori.head()

from sklearn.model_selection import train_test_split

X = flori.drop('variety', axis = 1)  # axis = 1 means we drop a column

y = flori['variety']  # y holds only the label column
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2,
                                                    random_state = 42)
# the Gaussian Naive Bayes model is characterized by probabilities
# MAY APPEAR ON THE EXAM, formula included - fairly high probability
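
For reference, the likelihood Gaussian NB assumes for each feature x_i within class y (the formula hinted at above) is
P(x_i | y) = 1 / sqrt(2 * pi * sigma_y^2) * exp(-(x_i - mu_y)^2 / (2 * sigma_y^2)),
where mu_y and sigma_y^2 are the mean and variance of x_i over the training samples of class y; the predicted class maximizes P(y) * prod_i P(x_i | y).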
from sklearn.naive_bayes import GaussianNB

model = GaussianNB()
model.fit(X_train, y_train)

from sklearn.metrics import accuracy_score


y_pred = model.predict(X_test)
acuratete = accuracy_score(y_test, y_pred)
print(f"Acuratetea este {acuratete}")

# normalization - e.g., when the features use different units of measure

from sklearn.preprocessing import MinMaxScaler, StandardScaler
scaler = MinMaxScaler()
X_train_scalat = scaler.fit_transform(X_train)
X_test_scalat = scaler.transform(X_test)
model.fit(X_train_scalat, y_train)  # refit on the scaled features, against y_train
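
A minimal sketch of re-checking accuracy on the scaled test set, mirroring the evaluation above:

y_pred_scalat = model.predict(X_test_scalat)
print(f"Accuracy after MinMax scaling: {accuracy_score(y_test, y_pred_scalat)}")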

Curs 26.03

DECISION TREE
import numpy as np
import pandas as pd
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import cross_val_score

floricele = pd.read_csv('iris.csv')
X = floricele.drop('variety', axis = 1)
y = floricele['variety']

# we skip LabelEncoding - sklearn handles string class labels automatically


model_entropy = DecisionTreeClassifier(criterion = 'entropy', random_state = 42)
scoruri_entropie = cross_val_score(model_entropy, X, y, cv = 5)  # 5 is the number of "boxes" (folds) the data is split into
print(f'Scores for IG (entropy): {scoruri_entropie}')

print(f'mean score for IG: {sum(scoruri_entropie) / len(scoruri_entropie)}')

print(f'mean score for IG: {np.mean(scoruri_entropie)}')

model_gini = DecisionTreeClassifier(criterion = 'gini', random_state = 42)

scoruri_gini = cross_val_score(model_gini, X, y, cv = 4)  # 4 folds this time
print(f'Scores for gini: {scoruri_gini}')

print(f'mean score for gini: {np.mean(scoruri_gini)}')


Lab 04.03
import pandas as pd
from sklearn.datasets import load_iris

irisi = load_iris()
irisi.feature_names

df = pd.DataFrame(irisi.data, columns = irisi.feature_names)


df.head()

df['target'] = irisi.target
irisi.target_names

df[df.target == 1].head()  # 0 - setosa, 1 - versicolor, 2 - virginica

df['nume_floare'] = df.target.apply(lambda x: irisi.target_names[x])


df.head()

df.tail()

from matplotlib import pyplot as plt


df0 = df[df.target == 0]
df0.head()
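
The notes stop here; a hedged sketch of the usual continuation - plotting two of the classes against each other on the sepal features (df1 and the column names below follow the load_iris conventions, not something recorded in class):

df1 = df[df.target == 1]

plt.scatter(df0['sepal length (cm)'], df0['sepal width (cm)'], color = 'green', label = 'setosa')
plt.scatter(df1['sepal length (cm)'], df1['sepal width (cm)'], color = 'blue', label = 'versicolor')
plt.xlabel('sepal length (cm)')
plt.ylabel('sepal width (cm)')
plt.legend()
plt.show()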
Curs 02.04
LDA
# LDA as a dimensionality reduction algorithm
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.datasets import load_iris

flori = load_iris()
type(flori)

X = flori.data
y = flori.target

lda = LinearDiscriminantAnalysis(n_components = 2)

X_lda = lda.fit_transform(X, y)

import matplotlib.pyplot as plt


plt.figure(figsize = (8, 6))
plt.scatter(X_lda[:, 0], X_lda[:, 1], c = y, cmap = 'viridis', edgecolor = 'k')
plt.xlabel('LDA component 1')
plt.ylabel('LDA component 2')
plt.title('Iris data projected onto 2 LDA components')
plt.colorbar(label = 'Classes')
plt.show()
print('thank you, professor')

# homework: using the dimensionality-reduced data, build a Naive Bayes model (base) and a decision tree
~~ not sure what I did here:
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X_lda, y, test_size = 0.2,
                                                    random_state = 42)

model = GaussianNB()
model.fit(X_train, y_train)

from sklearn.metrics import accuracy_score


y_pred = model.predict(X_test)
acuratete = accuracy_score(y_test, y_pred)
print(f"Acuratetea este {acuratete}")

Lab 08.04
RANDOM FOREST
from sklearn.model_selection import train_test_split
import pandas as pd
from sklearn.metrics import accuracy_score
from sklearn.ensemble import RandomForestClassifier

floricele = pd.read_csv('iris.csv')
# floricele becomes a pandas DataFrame
X = floricele.drop('variety', axis = 1)
y = floricele['variety']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2,
                                                    random_state = 42)

rf_clasificator = RandomForestClassifier(n_estimators = 100, random_state = 42)
# n_estimators - the number of decision trees

rf_clasificator.fit(X_train, y_train)

predictii = rf_clasificator.predict(X_test)

print('Accuracy: ', accuracy_score(y_test, predictii))

# RF with GridSearchCV
from sklearn.model_selection import GridSearchCV  # performs cross-validation automatically
parametrii_grid = {
'n_estimators' : [50, 100, 150],
'max_depth' : [None, 5, 10],
'min_samples_split' : [2, 5, 10],
'min_samples_leaf' : [1, 2, 4],
'max_features' : [None, 'sqrt'],
}

rf_clas_grid = RandomForestClassifier(random_state = 42)


grid_search = GridSearchCV(rf_clas_grid, parametrii_grid, cv = 5)

grid_search.fit(X, y)

print("Cei mai buni hiperparametri: ", grid_search.best_params_)

Lab 22.04
XGBoost
# an example with the XGBoost classifier
from xgboost import XGBClassifier

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
import joblib  # for saving (serializing) the model
from sklearn import preprocessing

floricele = pd.read_csv('iris.csv')
X = floricele.drop('variety', axis = 1)
y = floricele['variety']
label_encoder = preprocessing.LabelEncoder()
y = label_encoder.fit_transform(y)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2,
random_state = 42)

clasificatori = {
'Logistic Regression': LogisticRegression(),
'SVM' : SVC(),
'KNN' : KNeighborsClassifier(),
'Decision Tree' : DecisionTreeClassifier(),
'RandomForest' : RandomForestClassifier(),
}

# dictionary for storing the trained models

modele_antrenate = {}
for nume_clasif, clasif in clasificatori.items():
    model = XGBClassifier(base_estimators = clasif)
    model.fit(X_train, y_train)
    # save the model in the dictionary
    modele_antrenate[nume_clasif] = model
    y_pred = model.predict(X_test)
    acuratetea = accuracy_score(y_test, y_pred)
    print(f'{nume_clasif} accuracy: {acuratetea}')

# this only works with decision trees, because XGBoost no longer has base_estimators
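
Given that note, a minimal working rewrite (an assumption about the intent, not what ran in class) is to benchmark each classifier directly and add a plain XGBClassifier alongside them, since XGBoost always boosts its own trees:

clasificatori['XGBoost'] = XGBClassifier()  # no base estimator parameter exists

for nume_clasif, clasif in clasificatori.items():
    clasif.fit(X_train, y_train)
    modele_antrenate[nume_clasif] = clasif
    y_pred = clasif.predict(X_test)
    print(f'{nume_clasif} accuracy: {accuracy_score(y_test, y_pred)}')

# the joblib import above can then serialize any of the trained models
joblib.dump(modele_antrenate['XGBoost'], 'model_xgboost.joblib')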

Curs 23.04
KMeans
import pandas as pd
from sklearn.cluster import KMeans, DBSCAN
from sklearn.preprocessing import LabelEncoder, StandardScaler
import matplotlib.pyplot as plt

floricele = pd.read_csv('iris.csv')
# X = floricele.drop('variety', axis = 1), or like this:
X = floricele[['sepal.length', 'sepal.width', 'petal.length', 'petal.width']]
y_umane = floricele['variety']

# standardization
scaler = StandardScaler()
X_scalat = scaler.fit_transform(X)
floricele_scalat = pd.DataFrame(X_scalat, columns = X.columns)

inertia = []
for nr_c in range(1, 11):
    kmeans = KMeans(n_clusters = nr_c, random_state = 42)
    kmeans.fit(X_scalat)
    inertia.append(kmeans.inertia_)

plt.figure(figsize = (10, 6))
plt.plot(range(1, 11), inertia, marker = 'o')
plt.title('Elbow method for KMeans')
plt.xlabel('number of clusters')
plt.ylabel('inertia')
plt.xticks(range(1, 11))
plt.show()

kmeans = KMeans(n_clusters = 3, random_state = 42)


kmeans_labels = kmeans.fit_predict(X_scalat)

# ...
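
The notes trail off here; a hedged sketch of the usual next step - plotting the three clusters and comparing them with the human labels via the adjusted Rand index:

from sklearn.metrics import adjusted_rand_score

plt.figure(figsize = (10, 6))
plt.scatter(X_scalat[:, 0], X_scalat[:, 1], c = kmeans_labels, cmap = 'viridis')
plt.title('KMeans clusters (k = 3)')
plt.xlabel('sepal.length (scaled)')
plt.ylabel('sepal.width (scaled)')
plt.show()

label_encoder = LabelEncoder()  # LabelEncoder was imported at the top of this section
print(adjusted_rand_score(label_encoder.fit_transform(y_umane), kmeans_labels))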
Curs 29.04
DBSCAN
import pandas as pd
from sklearn.cluster import DBSCAN
from sklearn.preprocessing import StandardScaler, LabelEncoder
import matplotlib.pyplot as plt

floricele = pd.read_csv('iris.csv')
X = floricele[['sepal.length', 'sepal.width', 'petal.length', 'petal.width']]
y_adev = floricele['variety']

label_encoder = LabelEncoder()
y_adev_num = label_encoder.fit_transform(y_adev)

# standardization = we force the features to look like a Gaussian (zero mean, unit variance)


scaler = StandardScaler()
X_scalat = scaler.fit_transform(X)
floricele_scalat = pd.DataFrame(X_scalat, columns = X.columns)

floricele_scalat.head()

dbscan = DBSCAN(eps = 0.5, min_samples = 5)

# radius of 0.5; minimum number of neighbors for a core point = 5
dbscan_etichete = dbscan.fit_predict(X_scalat)

dbscan_etichete
# -1 marks noise

plt.figure(figsize = (10, 6))
plt.scatter(X_scalat[:, 0], X_scalat[:, 1], c = dbscan_etichete, cmap = 'viridis')
plt.title('Clustering with DBSCAN')
plt.xlabel('sepal length (scaled)')
plt.ylabel('sepal width (scaled)')
plt.show()

# evaluation
from sklearn.metrics import silhouette_score, calinski_harabasz_score, adjusted_rand_score

dbscan_non_noise = (dbscan_etichete != -1)  # keep only the non-noise points

dbscan_silhouette = silhouette_score(X_scalat[dbscan_non_noise],
                                     dbscan_etichete[dbscan_non_noise])

dbscan_silhouette
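
The other two imported metrics go unused in the notes; a minimal sketch of how they would complete the evaluation:

dbscan_ch = calinski_harabasz_score(X_scalat[dbscan_non_noise],
                                    dbscan_etichete[dbscan_non_noise])  # variance-ratio score

dbscan_ari = adjusted_rand_score(y_adev_num, dbscan_etichete)  # agreement with the true varieties

print(dbscan_silhouette, dbscan_ch, dbscan_ari)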
