Laboratoare SBC
Lab 03
KNN
# --- KNN on the iris dataset ---
from sklearn.datasets import load_iris
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn import metrics

# Fix: this line was commented out, leaving `flori` undefined below.
flori = load_iris()
X = flori.data
y = flori.target  # class labels

# Hold out 20% for testing; random_state makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)
len(X_train)  # number of training samples

# Hyper-parameter tuning for k.
# Fix: k_range, scoruri and scoruri_lista were never defined, and
# `metrics.accuracy_source` does not exist -- the function is `accuracy_score`.
k_range = range(1, 26)
scoruri = {}
scoruri_lista = []
for k in k_range:
    knn = KNeighborsClassifier(n_neighbors=k)  # KNeighborsClassifier is a constructor
    knn.fit(X_train, y_train)
    y_pred = knn.predict(X_test)
    scoruri[k] = (y_test, y_pred)
    # append adds the latest accuracy at the end of the list
    scoruri_lista.append(metrics.accuracy_score(y_test, y_pred))

# Choose k = 5 and retrain on the full dataset.
knn = KNeighborsClassifier(n_neighbors=5)
knn.fit(X, y)

# Predict the class of two new flowers (4 features each).
X_nou = [[3, 4, 5, 2], [5, 4, 2, 2]]
y_predict = knn.predict(X_nou)
y_predict
X_train
Lab 11.03
SVM
# --- Linear SVM on the iris dataset (CSV version) ---
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder        # fix: used below without import
from sklearn.model_selection import train_test_split  # fix: split was never created
from sklearn.svm import SVC                           # fix: used below without import
from sklearn.metrics import confusion_matrix, classification_report  # fix: missing imports

# Load the data.
floricele = pd.read_csv('iris.csv')
print(floricele.head())
print(floricele.info())

# EDA
sns.countplot(x='variety', data=floricele)
floricele.describe()
# 50% is the median, which splits the data set into two equal halves
# 25% is the first quartile

X = floricele.drop('variety', axis=1)
X.head()
y = floricele['variety']
y.tail()

# Encode the string class names as integers.
label_encoder = LabelEncoder()
type(label_encoder)
y_encoded = label_encoder.fit_transform(y)
y_encoded

# Fix: the original fit on X_train/y_train without ever defining a train/test
# split (NameError).  Split on the encoded labels.
X_train, X_test, y_train, y_test = train_test_split(
    X, y_encoded, test_size=0.2, random_state=42
)

# Linear SVM; `model` is an instance/object of the SVC class.
model = SVC(kernel='linear')
model.fit(X_train, y_train)
predictii = model.predict(X_test)

# Confusion matrix and per-class metrics.
mat_conf = confusion_matrix(y_test, predictii)
print(mat_conf)
print(classification_report(y_test, predictii))
# --- Gaussian Naive Bayes on the iris dataset ---
import pandas as pd
from sklearn.preprocessing import LabelEncoder        # fix: missing import
from sklearn.naive_bayes import GaussianNB            # fix: missing import
from sklearn.model_selection import train_test_split  # fix: missing import

flori = pd.read_csv('iris.csv')
flori.head()

# Encode the class names as integers, in place.
label_encoder = LabelEncoder()
flori['variety'] = label_encoder.fit_transform(flori['variety'])
flori.head()

# Fix: the original called model.fit(X_train, y_train) without ever defining
# the split (NameError).
X = flori.drop('variety', axis=1)
y = flori['variety']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

model = GaussianNB()
model.fit(X_train, y_train)
Curs 26.03
DECISION TREE
# --- Decision trees, LDA dimensionality reduction, and a Naive-Bayes baseline ---
import numpy as np
import pandas as pd
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.datasets import load_iris                                # fix: missing import
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis  # fix: missing import
from sklearn.naive_bayes import GaussianNB

floricele = pd.read_csv('iris.csv')
X = floricele.drop('variety', axis=1)
y = floricele['variety']

irisi = load_iris()
irisi.feature_names
# Fix: `df` was used before being created; build it from the bunch first.
df = pd.DataFrame(irisi.data, columns=irisi.feature_names)
df['target'] = irisi.target
irisi.target_names
df.tail()

flori = load_iris()
type(flori)
X = flori.data
y = flori.target

# Reduce to 2 discriminant components.
lda = LinearDiscriminantAnalysis(n_components=2)
X_lda = lda.fit_transform(X, y)

# Homework: using the dimensionally-reduced data, build a Naive-Bayes baseline
# (with and without the reduction) and a decision tree.
# NOTE(review): the original notes said "no idea what I did here" for the lines
# below; they also fit on X_train/y_train which were never defined -- split the
# LDA-reduced data first.
X_train, X_test, y_train, y_test = train_test_split(X_lda, y, test_size=0.2, random_state=42)
model = GaussianNB()
model.fit(X_train, y_train)
Lab 08.04
RANDOMFOREST
# --- Random Forest, then hyper-parameter tuning with GridSearchCV ---
from sklearn.model_selection import train_test_split, GridSearchCV
import pandas as pd
from sklearn.metrics import accuracy_score
from sklearn.ensemble import RandomForestClassifier

floricele = pd.read_csv('iris.csv')  # floricele becomes a pandas DataFrame
X = floricele.drop('variety', axis=1)
y = floricele['variety']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2,
                                                    random_state=42)

# Fix: rf_clasificator was used without ever being constructed.
rf_clasificator = RandomForestClassifier(random_state=42)
rf_clasificator.fit(X_train, y_train)
predictii = rf_clasificator.predict(X_test)

# RF with GridSearchCV (performs cross-validation automatically).
parametrii_grid = {
    'n_estimators': [50, 100, 150],
    'max_depth': [None, 5, 10],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'max_features': [None, 'sqrt'],
}
# Fix: grid_search was used without ever being constructed.
grid_search = GridSearchCV(RandomForestClassifier(random_state=42),
                           parametrii_grid, cv=5)
grid_search.fit(X, y)
Lab 22.04
XGBoost
# un exemplu cu XGBoost Classifier
# --- Comparing several classifiers (including XGBoost) on iris ---
from xgboost import XGBClassifier
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
import joblib  # for saving (serializing) the model
from sklearn import preprocessing

floricele = pd.read_csv('iris.csv')
X = floricele.drop('variety', axis=1)
y = floricele['variety']

# XGBoost requires integer class labels, hence the encoding.
label_encoder = preprocessing.LabelEncoder()
y = label_encoder.fit_transform(y)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2,
                                                    random_state=42)

# Fix: the section is titled "an example with XGBoost Classifier" and imports
# XGBClassifier, but it was missing from the model dictionary.
clasificatori = {
    'Logistic Regression': LogisticRegression(),
    'SVM': SVC(),
    'KNN': KNeighborsClassifier(),
    'Decision Tree': DecisionTreeClassifier(),
    'RandomForest': RandomForestClassifier(),
    'XGBoost': XGBClassifier(),
}
Curs 23.04
KMeans
# --- KMeans: the elbow method on the standardized iris features ---
import pandas as pd
from sklearn.cluster import KMeans, DBSCAN
from sklearn.preprocessing import LabelEncoder, StandardScaler
import matplotlib.pyplot as plt

floricele = pd.read_csv('iris.csv')
# X = floricele.drop('variety', axis = 1)  -- or, equivalently:
X = floricele[['sepal.length', 'sepal.width', 'petal.length', 'petal.width']]
y_umane = floricele['variety']

# Standardize the features (KMeans is distance-based, so scaling matters).
scaler = StandardScaler()
X_scalat = scaler.fit_transform(X)
floricele_scalat = pd.DataFrame(X_scalat, columns=X.columns)

# Elbow method: record the inertia for k = 1..10.
# Fix: the loop body had lost its indentation (syntax error).
inertia = []
for nr_c in range(1, 11):
    kmeans = KMeans(n_clusters=nr_c, random_state=42)
    kmeans.fit(X_scalat)
    inertia.append(kmeans.inertia_)

plt.figure(figsize=(10, 6))
plt.plot(range(1, 11), inertia, marker='o')
plt.title('Metoda cotului pt KMeans')
plt.xlabel('nr. de clustere')
plt.ylabel('inertia')
plt.xticks(range(1, 11))
plt.show()
# ...
# ...
Curs 29.04
DBScan
# --- DBSCAN clustering on the standardized iris features ---
import pandas as pd
from sklearn.cluster import DBSCAN
from sklearn.preprocessing import StandardScaler, LabelEncoder
import matplotlib.pyplot as plt
# Fix: this import was broken across two lines (syntax error).
from sklearn.metrics import silhouette_score, calinski_harabasz_score, adjusted_rand_score

floricele = pd.read_csv('iris.csv')
X = floricele[['sepal.length', 'sepal.width', 'petal.length', 'petal.width']]
y_adev = floricele['variety']

label_encoder = LabelEncoder()
y_adev_num = label_encoder.fit_transform(y_adev)

# Fix: X_scalat / floricele_scalat were used below but never created here.
scaler = StandardScaler()
X_scalat = scaler.fit_transform(X)
floricele_scalat = pd.DataFrame(X_scalat, columns=X.columns)
floricele_scalat.head()

# Fix: dbscan_etichete was used but DBSCAN was never fitted.
# NOTE(review): eps/min_samples reconstructed with the sklearn defaults -- confirm
# against the original notebook.
dbscan = DBSCAN(eps=0.5, min_samples=5)
dbscan_etichete = dbscan.fit_predict(X_scalat)
dbscan_etichete
# label -1 marks noise points

plt.figure(figsize=(10, 6))
plt.scatter(X_scalat[:, 0], X_scalat[:, 1], c=dbscan_etichete, cmap='viridis')
plt.title('Clustering cu DBSCAN')
plt.xlabel('Sepal Length (scalat)')
plt.ylabel('Sepal width (scalat)')

# Evaluation: silhouette over the non-noise points only.
# Fix: noise is labelled -1 (see the comment above), but the original filtered
# with `!= 1`, which keeps the noise and drops cluster 1 instead.
dbscan_non_noise = (dbscan_etichete != -1)
dbscan_silhouette = silhouette_score(X_scalat[dbscan_non_noise],
                                     dbscan_etichete[dbscan_non_noise])
dbscan_silhouette