Untitled1.ipynb - Colab
Untitled1.ipynb - Colab
ipynb - Colab
import pandas as pd
from io import StringIO
import sys
# Toy CSV with two deliberately empty cells (column C in the second data row,
# column D in the third) used to demonstrate missing-value handling.
csv_data = (
    'A,B,C,D\n'
    '1.0,2.0,3.0,4.0\n'
    '5.0,6.0,,8.0\n'
    '10.0,11.0,12.0,'
)

# read_csv expects a path or file-like object, so the string is wrapped in an
# in-memory text buffer; the empty cells come back as NaN.
df = pd.read_csv(StringIO(csv_data))
df
A B C D
2 10.0 11.0 12.0 NaN
df.isnull().sum()
A 0
B 0
C 1
D 1
dtype: int64
df.dropna(axis=0)
A B C D
0 1.0 2.0 3.0 4.0
df.dropna(axis=1)
A B
0 1.0 2.0
1 5.0 6.0
2 10.0 11.0
df.dropna(axis=1)
https://colab.research.google.com/drive/1cwddBLFwJo5Ds0rhwuQgezWGXsjeQOLx#scrollTo=i7i9SxHm0GYS&printMode=true 1/10
12/28/24, 5:21 PM Untitled1.ipynb - Colab
A B
0 1.0 2.0
1 5.0 6.0
2 10.0 11.0
df.dropna(how='all')
A B C D
2 10.0 11.0 12.0 NaN
df.dropna(thresh=4)
A B C D
0 1.0 2.0 3.0 4.0
# only drop rows where NaN appear in specific columns (here: 'C')
df.dropna(subset=['C'])
A B C D
2 10.0 11.0 12.0 NaN
array([[ 1. , 2. , 3. , 4. ],
[ 5. , 6. , 7.5, 8. ],
[10. , 11. , 12. , 6. ]])
df.fillna(df.mean())
A B C D
2 10.0 11.0 12.0 6.0
import pandas as pd
https://colab.research.google.com/drive/1cwddBLFwJo5Ds0rhwuQgezWGXsjeQOLx#scrollTo=i7i9SxHm0GYS&printMode=true 2/10
12/28/24, 5:21 PM Untitled1.ipynb - Colab
2 blue XL 15.3 class2
# Ordinal encoding for the 'size' feature: each size label gets an integer code.
size_mapping = dict(XL=3, L=2, M=1)

# Translate the string labels to their integer codes and write them back
# into the same column.
size_codes = df['size'].map(size_mapping)
df['size'] = size_codes
df
2 blue 3 15.3 class2
size
0 M
1 L
2 XL
dtype: object
import numpy as np
{'class1': 0, 'class2': 1}
0 green 1 10.1 1
1 red 2 13.5 0
2 blue 3 15.3 1
2 blue 3 15.3 class2
Next steps: Generate code with df toggle_off View recommended plots New interactive sheet
https://colab.research.google.com/drive/1cwddBLFwJo5Ds0rhwuQgezWGXsjeQOLx#scrollTo=i7i9SxHm0GYS&printMode=true 3/10
12/28/24, 5:21 PM Untitled1.ipynb - Colab
from sklearn.preprocessing import LabelEncoder
array([1, 0, 1])
# reverse mapping
class_le.inverse_transform(y)
array([[1, 1, 10.1],
[2, 2, 13.5],
[0, 3, 15.3]], dtype=object)
array([[ 0. , 1. , 0. , 1. , 10.1],
[ 0. , 0. , 1. , 2. , 13.5],
[ 1. , 0. , 0. , 3. , 15.3]])
2 15.3 3 False False
array([[ 1. , 0. , 1. , 10.1],
[ 0. , 1. , 2. , 13.5],
[ 0. , 0. , 3. , 15.3]])
# Load the Wine dataset directly from the UCI repository.
# header=None tells pandas not to treat the first line as column names, so
# the columns are labelled by position until they are renamed.
# NOTE: the URL previously carried a "" prefix that is not
# part of the UCI host name; it has been removed so the request reaches
# archive.ics.uci.edu.
df_wine = pd.read_csv('https://archive.ics.uci.edu/'
                      'ml/machine-learning-databases/wine/wine.data',
                      header=None)
https://colab.research.google.com/drive/1cwddBLFwJo5Ds0rhwuQgezWGXsjeQOLx#scrollTo=i7i9SxHm0GYS&printMode=true 4/10
12/28/24, 5:21 PM Untitled1.ipynb - Colab
Class labels [1 2 3]
Class label  Alcohol  Malic acid  Ash  Alcalinity of ash  Magnesium  Total phenols  Flavanoids  Nonflavanoid phenols  Proanthocyanins  Color intensity  Hue  OD280/OD315 of diluted wines
0 1 14.23 1.71 2.43 15.6 127 2.80 3.06 0.28 2.29 5.64 1.04 3.9
1 1 13.20 1.78 2.14 11.2 100 2.65 2.76 0.26 1.28 4.38 1.05 3.4
2 1 13.16 2.36 2.67 18.6 101 2.80 3.24 0.30 2.81 5.68 1.03 3.1
3 1 14.37 1.95 2.50 16.8 113 3.85 3.49 0.24 2.18 7.80 0.86 3.4
# Min-max scaling: the scaler is fitted on the training split only and the
# fitted scaler is then applied to both splits.
mms = MinMaxScaler().fit(X_train)
X_train_norm = mms.transform(X_train)
X_test_norm = mms.transform(X_test)

# Standardization, following the same fit-on-train / apply-to-both pattern.
stdsc = StandardScaler().fit(X_train)
X_train_std = stdsc.transform(X_train)
X_test_std = stdsc.transform(X_test)
# Small worked example: six evenly spaced integers.
ex = np.array([0, 1, 2, 3, 4, 5])

# normalize: shift by the minimum and divide by the value range, which maps
# the smallest entry to 0 and the largest to 1
ex_min, ex_max = ex.min(), ex.max()
print('normalized:', (ex - ex_min) / (ex_max - ex_min))
▾ LogisticRegression i ?
https://colab.research.google.com/drive/1cwddBLFwJo5Ds0rhwuQgezWGXsjeQOLx#scrollTo=i7i9SxHm0GYS&printMode=true 5/10
12/28/24, 5:21 PM Untitled1.ipynb - Colab
# Report the model's score on the standardized training and test splits.
lr_train_acc = lr.score(X_train_std, y_train)
print('Training accuracy:', lr_train_acc)
lr_test_acc = lr.score(X_test_std, y_test)
print('Test accuracy:', lr_test_acc)
lr.intercept_
np.set_printoptions(8)
lr.coef_[lr.coef_!=0].shape
(23,)
lr.coef_
fig = plt.figure()
ax = plt.subplot(111)
weights = np.array(weights)
https://colab.research.google.com/drive/1cwddBLFwJo5Ds0rhwuQgezWGXsjeQOLx#scrollTo=i7i9SxHm0GYS&printMode=true 6/10
12/28/24, 5:21 PM Untitled1.ipynb - Colab
class SBS():
    """Feature-subset selector configured with an estimator and a target size.

    NOTE(review): only the constructor is intact at this point in the file;
    the method bodies that follow have lost their ``def`` lines and
    indentation, so they are not part of this class body.
    """

    def __init__(self, estimator, k_features, scoring=accuracy_score,
                 test_size=0.25, random_state=1):
        """Store the selection settings on the instance.

        Parameters
        ----------
        estimator
            Model to evaluate feature subsets with. It is copied with
            ``clone`` before being stored, so the caller's object is not
            the one kept on the instance.
        k_features
            Stored as-is. Presumably the number of features to stop at --
            confirm against the fitting code.
        scoring
            Callable stored as-is; defaults to ``accuracy_score``.
        test_size
            Stored as-is (default 0.25). Presumably the hold-out fraction
            for an internal split -- confirm against the fitting code.
        random_state
            Stored as-is (default 1). Presumably the seed for that split.
        """
        self.scoring = scoring
        self.estimator = clone(estimator)
        self.k_features = k_features
        self.test_size = test_size
        self.random_state = random_state
dim = X_train.shape[1]
self.indices_ = tuple(range(dim))
self.subsets_ = [self.indices_]
score = self._calc_score(X_train, y_train,
X_test, y_test, self.indices_)
self.scores_ = [score]
https://colab.research.google.com/drive/1cwddBLFwJo5Ds0rhwuQgezWGXsjeQOLx#scrollTo=i7i9SxHm0GYS&printMode=true 7/10
12/28/24, 5:21 PM Untitled1.ipynb - Colab
best = np.argmax(scores)
self.indices_ = subsets[best]
self.subsets_.append(self.indices_)
dim -= 1
self.scores_.append(scores[best])
self.k_score_ = self.scores_[-1]
return self
# Classifier handed to SBS as the estimator whose score drives the selection.
knn = KNeighborsClassifier(n_neighbors=5)
# selecting features
sbs = SBS(knn, k_features=1)
sbs.fit(X_train_std, y_train)
# Take the feature-index tuple stored at position 10 of sbs.subsets_.
# NOTE(review): presumably the 3-feature subset (hence the name "k3"),
# assuming 13 input features and one feature removed per step -- confirm
# against SBS.fit.
k3 = list(sbs.subsets_[10])
# columns[1:] skips the first column (presumably the class label -- confirm)
# so that the positions in k3 line up with the feature columns.
print(df_wine.columns[1:][k3])
# Baseline: fit the classifier on every standardized feature and report
# its score on both splits.
knn.fit(X_train_std, y_train)
knn_train_acc = knn.score(X_train_std, y_train)
print('Training accuracy:', knn_train_acc)
knn_test_acc = knn.score(X_test_std, y_test)
print('Test accuracy:', knn_test_acc)
https://colab.research.google.com/drive/1cwddBLFwJo5Ds0rhwuQgezWGXsjeQOLx#scrollTo=i7i9SxHm0GYS&printMode=true 8/10
12/28/24, 5:21 PM Untitled1.ipynb - Colab
# Refit the classifier using only the columns listed in k3 and report its
# score on both splits, restricted to those same columns.
X_train_k3 = X_train_std[:, k3]
knn.fit(X_train_k3, y_train)
print('Training accuracy:', knn.score(X_train_k3, y_train))
X_test_k3 = X_test_std[:, k3]
print('Test accuracy:', knn.score(X_test_k3, y_test))
# Labels for the feature columns (every column except the first).
feat_labels = df_wine.columns[1:]

# Fit a 500-tree random forest with a fixed seed and read off the
# per-feature importances it exposes.
forest = RandomForestClassifier(n_estimators=500,
                                random_state=1)
forest.fit(X_train, y_train)
importances = forest.feature_importances_

# Feature positions ordered from highest to lowest importance.
indices = np.argsort(importances)[::-1]

# Ranked text listing: rank, left-aligned label (width 30), importance.
n_features = X_train.shape[1]
for rank in range(1, n_features + 1):
    feature = indices[rank - 1]
    print("%2d) %-*s %f" % (rank, 30,
                            feat_labels[feature],
                            importances[feature]))

# Bar chart of the same ranking, one bar per feature.
bar_positions = range(n_features)
plt.title('Feature Importance')
plt.bar(bar_positions, importances[indices], align='center')
plt.xticks(bar_positions, feat_labels[indices], rotation=90)
plt.xlim([-1, n_features])
plt.tight_layout()
#plt.savefig('images/04_09.png', dpi=300)
plt.show()
1) Proline 0.185453
2) Flavanoids 0.174751
3) Color intensity 0.143920
4) OD280/OD315 of diluted wines 0.136162
5) Alcohol 0.118529
6) Hue 0.058739
7) Total phenols 0.050872
8) Magnesium 0.031357
9) Malic acid 0.025648
10) Proanthocyanins 0.025570
11) Alcalinity of ash 0.022366
12) Nonflavanoid phenols 0.013354
13) Ash 0.013279
https://colab.research.google.com/drive/1cwddBLFwJo5Ds0rhwuQgezWGXsjeQOLx#scrollTo=i7i9SxHm0GYS&printMode=true 9/10
12/28/24, 5:21 PM Untitled1.ipynb - Colab
for f in range(X_selected.shape[1]):
print("%2d) %-*s %f" % (f + 1, 30,
https://colab.research.google.com/drive/1cwddBLFwJo5Ds0rhwuQgezWGXsjeQOLx#scrollTo=i7i9SxHm0GYS&printMode=true 10/10