Untitled1.ipynb - Colab

import pandas as pd
from io import StringIO
import sys

csv_data = \
'''A,B,C,D
1.0,2.0,3.0,4.0
5.0,6.0,,8.0
10.0,11.0,12.0,'''

# If you are using Python 2.7, you need
# to convert the string to unicode:
if sys.version_info < (3, 0):
    csv_data = unicode(csv_data)

df = pd.read_csv(StringIO(csv_data))
df

      A     B     C    D
0   1.0   2.0   3.0  4.0
1   5.0   6.0   NaN  8.0
2  10.0  11.0  12.0  NaN


df.isnull().sum()

A    0
B    0
C    1
D    1
dtype: int64

# access the underlying NumPy array
# via the `values` attribute
df.values

array([[ 1.,  2.,  3.,  4.],
       [ 5.,  6., nan,  8.],
       [10., 11., 12., nan]])

# remove rows that contain missing values

df.dropna(axis=0)

     A    B    C    D
0  1.0  2.0  3.0  4.0

# remove columns that contain missing values

df.dropna(axis=1)

      A     B
0   1.0   2.0
1   5.0   6.0
2  10.0  11.0


# only drop rows where all columns are NaN

df.dropna(how='all')

      A     B     C    D
0   1.0   2.0   3.0  4.0
1   5.0   6.0   NaN  8.0
2  10.0  11.0  12.0  NaN

# drop rows that have fewer than 4 real values

df.dropna(thresh=4)

     A    B    C    D
0  1.0  2.0  3.0  4.0

# only drop rows where NaN appear in specific columns (here: 'C')

df.dropna(subset=['C'])

      A     B     C    D
0   1.0   2.0   3.0  4.0
2  10.0  11.0  12.0  NaN
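
These dropna arguments can also be combined; a minimal sketch (using the column names of this toy frame):

# keep only rows with at least 2 non-NaN values among columns C and D
df.dropna(subset=['C', 'D'], thresh=2)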

# again: our original array
df.values

array([[ 1.,  2.,  3.,  4.],
       [ 5.,  6., nan,  8.],
       [10., 11., 12., nan]])

# impute missing values via the column mean

from sklearn.impute import SimpleImputer
import numpy as np

imr = SimpleImputer(missing_values=np.nan, strategy='mean')
imr = imr.fit(df.values)
imputed_data = imr.transform(df.values)
imputed_data

array([[ 1. ,  2. ,  3. ,  4. ],
       [ 5. ,  6. ,  7.5,  8. ],
       [10. , 11. , 12. ,  6. ]])
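
SimpleImputer supports other strategies as well; a minimal sketch (the strategy names below come from scikit-learn's documented API):

# median is more robust to outliers; most_frequent also works for categorical data
imr_median = SimpleImputer(missing_values=np.nan, strategy='median')
print(imr_median.fit_transform(df.values))

# constant fills every missing cell with a user-supplied value
imr_const = SimpleImputer(missing_values=np.nan, strategy='constant', fill_value=0.0)
print(imr_const.fit_transform(df.values))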

df.fillna(df.mean())

      A     B     C    D
0   1.0   2.0   3.0  4.0
1   5.0   6.0   7.5  8.0
2  10.0  11.0  12.0  6.0

import pandas as pd

df = pd.DataFrame([['green', 'M', 10.1, 'class2'],
                   ['red', 'L', 13.5, 'class1'],
                   ['blue', 'XL', 15.3, 'class2']])

df.columns = ['color', 'size', 'price', 'classlabel']
df


   color size  price classlabel
0  green    M   10.1     class2
1    red    L   13.5     class1
2   blue   XL   15.3     class2


size_mapping = {'XL': 3,
                'L': 2,
                'M': 1}

df['size'] = df['size'].map(size_mapping)
df

   color  size  price classlabel
0  green     1   10.1     class2
1    red     2   13.5     class1
2   blue     3   15.3     class2


inv_size_mapping = {v: k for k, v in size_mapping.items()}
df['size'].map(inv_size_mapping)

0     M
1     L
2    XL
Name: size, dtype: object

import numpy as np

# create a mapping dict
# to convert class labels from strings to integers
class_mapping = {label: idx for idx, label in enumerate(np.unique(df['classlabel']))}
class_mapping

{'class1': 0, 'class2': 1}

# to convert class labels from strings to integers
df['classlabel'] = df['classlabel'].map(class_mapping)
df

   color  size  price  classlabel
0  green     1   10.1           1
1    red     2   13.5           0
2   blue     3   15.3           1


# reverse the class label mapping
inv_class_mapping = {v: k for k, v in class_mapping.items()}
df['classlabel'] = df['classlabel'].map(inv_class_mapping)
df

   color  size  price classlabel
0  green     1   10.1     class2
1    red     2   13.5     class1
2   blue     3   15.3     class2

from sklearn.preprocessing import LabelEncoder

# Label encoding with sklearn's LabelEncoder
class_le = LabelEncoder()
y = class_le.fit_transform(df['classlabel'].values)
y

array([1, 0, 1])

# reverse mapping
class_le.inverse_transform(y)

array(['class2', 'class1', 'class2'], dtype=object)

X = df[['color', 'size', 'price']].values

color_le = LabelEncoder()
X[:, 0] = color_le.fit_transform(X[:, 0])
X

array([[1, 1, 10.1],
       [2, 2, 13.5],
       [0, 3, 15.3]], dtype=object)

from sklearn.preprocessing import OneHotEncoder

X = df[['color', 'size', 'price']].values

color_ohe = OneHotEncoder()
color_ohe.fit_transform(X[:, 0].reshape(-1, 1)).toarray()

array([[0., 1., 0.],
       [0., 0., 1.],
       [1., 0., 0.]])
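
fit_transform returns a SciPy sparse matrix by default, which is why .toarray() is needed above. You can also request a dense array directly; note the parameter is named sparse_output in scikit-learn >= 1.2 (sparse in older versions):

# dense one-hot encoding without the .toarray() call
color_ohe_dense = OneHotEncoder(sparse_output=False)
color_ohe_dense.fit_transform(X[:, 0].reshape(-1, 1))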

from sklearn.compose import ColumnTransformer

X = df[['color', 'size', 'price']].values

c_transf = ColumnTransformer([('onehot', OneHotEncoder(), [0]),
                              ('nothing', 'passthrough', [1, 2])])
c_transf.fit_transform(X).astype(float)

array([[ 0. ,  1. ,  0. ,  1. , 10.1],
       [ 0. ,  0. ,  1. ,  2. , 13.5],
       [ 1. ,  0. ,  0. ,  3. , 15.3]])
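
To see which column each position corresponds to, a fitted ColumnTransformer exposes get_feature_names_out() (assuming scikit-learn >= 1.0; with plain NumPy input the original columns are auto-named x0, x1, x2, and the exact names vary slightly across versions):

# inspect the generated column names after fitting
print(c_transf.get_feature_names_out())
# e.g. ['onehot__x0_blue' 'onehot__x0_green' 'onehot__x0_red'
#       'nothing__x1' 'nothing__x2']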

# one-hot encoding via pandas

pd.get_dummies(df[['price', 'color', 'size']])

# multicollinearity guard in get_dummies

pd.get_dummies(df[['price', 'color', 'size']], drop_first=True)

   price  size  color_green  color_red
0   10.1     1         True      False
1   13.5     2        False       True
2   15.3     3        False      False

# multicollinearity guard for the OneHotEncoder
color_ohe = OneHotEncoder(categories='auto', drop='first')

c_transf = ColumnTransformer([('onehot', color_ohe, [0]),
                              ('nothing', 'passthrough', [1, 2])])
c_transf.fit_transform(X).astype(float)

array([[ 1. ,  0. ,  1. , 10.1],
       [ 0. ,  1. ,  2. , 13.5],
       [ 0. ,  0. ,  3. , 15.3]])

df_wine = pd.read_csv('https://archive.ics.uci.edu/'
                      'ml/machine-learning-databases/wine/wine.data',
                      header=None)

# if the Wine dataset is temporarily unavailable from the
# UCI machine learning repository, un-comment the following line
# of code to load the dataset from a local path:
# df_wine = pd.read_csv('wine.data', header=None)

df_wine.columns = ['Class label', 'Alcohol', 'Malic acid', 'Ash',
                   'Alcalinity of ash', 'Magnesium', 'Total phenols',
                   'Flavanoids', 'Nonflavanoid phenols', 'Proanthocyanins',
                   'Color intensity', 'Hue', 'OD280/OD315 of diluted wines',
                   'Proline']

print('Class labels', np.unique(df_wine['Class label']))
df_wine.head()

Class labels [1 2 3]

   Class label  Alcohol  Malic acid   Ash  Alcalinity of ash  Magnesium  Total phenols  Flavanoids  Nonflavanoid phenols  Proanthocyanins  Color intensity   Hue  ...
0            1    14.23        1.71  2.43               15.6        127           2.80        3.06                  0.28             2.29             5.64  1.04  ...
1            1    13.20        1.78  2.14               11.2        100           2.65        2.76                  0.26             1.28             4.38  1.05  ...
2            1    13.16        2.36  2.67               18.6        101           2.80        3.24                  0.30             2.81             5.68  1.03  ...
3            1    14.37        1.95  2.50               16.8        113           3.85        3.49                  0.24             2.18             7.80  0.86  ...
(the 'OD280/OD315 of diluted wines' and 'Proline' columns are cut off in the Colab display)

from sklearn.model_selection import train_test_split

X, y = df_wine.iloc[:, 1:].values, df_wine.iloc[:, 0].values

X_train, X_test, y_train, y_test =\
    train_test_split(X, y,
                     test_size=0.3,
                     random_state=0,
                     stratify=y)

from sklearn.preprocessing import MinMaxScaler

mms = MinMaxScaler()
X_train_norm = mms.fit_transform(X_train)
X_test_norm = mms.transform(X_test)

from sklearn.preprocessing import StandardScaler

stdsc = StandardScaler()
X_train_std = stdsc.fit_transform(X_train)
X_test_std = stdsc.transform(X_test)

ex = np.array([0, 1, 2, 3, 4, 5])

print('standardized:', (ex - ex.mean()) / ex.std())

# Please note that pandas uses ddof=1 (sample standard deviation)
# by default, whereas NumPy's std method and the StandardScaler
# use ddof=0 (population standard deviation)

# normalize
print('normalized:', (ex - ex.min()) / (ex.max() - ex.min()))

standardized: [-1.46385011 -0.87831007 -0.29277002  0.29277002  0.87831007  1.46385011]
normalized: [0.  0.2 0.4 0.6 0.8 1. ]
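
A quick check of the ddof note above; a minimal sketch comparing the two conventions on the same array:

print(np.std(ex))            # ddof=0 (population): ~1.7078
print(pd.Series(ex).std())   # ddof=1 (sample):     ~1.8708
print(np.std(ex, ddof=1))    # matches the pandas result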

from sklearn.linear_model import LogisticRegression

LogisticRegression(penalty='l1', solver='liblinear', multi_class='ovr')

LogisticRegression(multi_class='ovr', penalty='l1', solver='liblinear')

from sklearn.linear_model import LogisticRegression

lr = LogisticRegression(penalty='l1', C=1.0, solver='liblinear', multi_class='ovr')

# Note that C=1.0 is the default. C is the inverse regularization
# strength, so decreasing it makes the regularization effect
# stronger and increasing it makes it weaker.
lr.fit(X_train_std, y_train)

print('Training accuracy:', lr.score(X_train_std, y_train))
print('Test accuracy:', lr.score(X_test_std, y_test))

Training accuracy: 1.0
Test accuracy: 1.0
/usr/local/lib/python3.10/dist-packages/sklearn/linear_model/_logistic.py:1256: FutureWarning: 'multi_class' was deprecated in versi...
  warnings.warn(

lr.intercept_

array([-1.26341044, -1.21608617, -2.3698699 ])

np.set_printoptions(8)

lr.coef_[lr.coef_!=0].shape

(23,)

lr.coef_
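
With C=1.0, 23 of the 3 x 13 weights are non-zero. A minimal sketch (the C values are illustrative) showing that a smaller C, i.e. a stronger L1 penalty, zeroes out more weights:

for C in [0.01, 0.1, 1.0, 10.0]:
    lr_c = LogisticRegression(penalty='l1', C=C, solver='liblinear', multi_class='ovr')
    lr_c.fit(X_train_std, y_train)
    # count the weights the L1 penalty left non-zero
    print(f'C={C}: {np.count_nonzero(lr_c.coef_)} non-zero weights')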

import matplotlib.pyplot as plt

fig = plt.figure()
ax = plt.subplot(111)

colors = ['blue', 'green', 'red', 'cyan',
          'magenta', 'yellow', 'black',
          'pink', 'lightgreen', 'lightblue',
          'gray', 'indigo', 'orange']

weights, params = [], []
for c in np.arange(-4., 6.):
    lr = LogisticRegression(penalty='l1', C=10.**c, solver='liblinear',
                            multi_class='ovr', random_state=0)
    lr.fit(X_train_std, y_train)
    weights.append(lr.coef_[1])
    params.append(10**c)

weights = np.array(weights)

for column, color in zip(range(weights.shape[1]), colors):
    plt.plot(params, weights[:, column],
             label=df_wine.columns[column + 1],
             color=color)

plt.axhline(0, color='black', linestyle='--', linewidth=3)
plt.xlim([10**(-5), 10**5])
plt.ylabel('weight coefficient')
plt.xlabel('C')
plt.xscale('log')
plt.legend(loc='upper left')
ax.legend(loc='upper center',
          bbox_to_anchor=(1.38, 1.03),
          ncol=1, fancybox=True)
#plt.savefig('images/04_07.png', dpi=300,
#            bbox_inches='tight', pad_inches=0.2)
plt.show()

/usr/local/lib/python3.10/dist-packages/sklearn/linear_model/_logistic.py:1256: FutureWarning: 'multi_class' was deprecated in versi...
  warnings.warn(
(the same FutureWarning is emitted once per fit in the loop)

from sklearn.base import clone
from itertools import combinations
import numpy as np
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

class SBS():
    """Sequential Backward Selection: repeatedly drop the feature whose
    removal hurts validation accuracy the least, until k_features remain."""

    def __init__(self, estimator, k_features, scoring=accuracy_score,
                 test_size=0.25, random_state=1):
        self.scoring = scoring
        self.estimator = clone(estimator)
        self.k_features = k_features
        self.test_size = test_size
        self.random_state = random_state

    def fit(self, X, y):
        # internal validation split, independent of any outer test set
        X_train, X_test, y_train, y_test = \
            train_test_split(X, y, test_size=self.test_size,
                             random_state=self.random_state)

        dim = X_train.shape[1]
        self.indices_ = tuple(range(dim))
        self.subsets_ = [self.indices_]
        score = self._calc_score(X_train, y_train,
                                 X_test, y_test, self.indices_)
        self.scores_ = [score]

        while dim > self.k_features:
            scores = []
            subsets = []

            # score every subset obtained by removing a single feature
            for p in combinations(self.indices_, r=dim - 1):
                score = self._calc_score(X_train, y_train,
                                         X_test, y_test, p)
                scores.append(score)
                subsets.append(p)

            # keep the best-scoring subset and shrink the dimensionality
            best = np.argmax(scores)
            self.indices_ = subsets[best]
            self.subsets_.append(self.indices_)
            dim -= 1

            self.scores_.append(scores[best])

        self.k_score_ = self.scores_[-1]

        return self

    def transform(self, X):
        return X[:, self.indices_]

    def _calc_score(self, X_train, y_train, X_test, y_test, indices):
        self.estimator.fit(X_train[:, indices], y_train)
        y_pred = self.estimator.predict(X_test[:, indices])
        score = self.scoring(y_test, y_pred)
        return score

import matplotlib.pyplot as plt
from sklearn.neighbors import KNeighborsClassifier

knn = KNeighborsClassifier(n_neighbors=5)

# selecting features
sbs = SBS(knn, k_features=1)
sbs.fit(X_train_std, y_train)

# plotting performance of feature subsets
k_feat = [len(k) for k in sbs.subsets_]

plt.plot(k_feat, sbs.scores_, marker='o')
plt.ylim([0.7, 1.02])
plt.ylabel('Accuracy')
plt.xlabel('Number of features')
plt.grid()
plt.tight_layout()
# plt.savefig('images/04_08.png', dpi=300)
plt.show()

# subsets_[10] is the 3-feature subset (13 features minus 10 eliminations)
k3 = list(sbs.subsets_[10])
print(df_wine.columns[1:][k3])

Index(['Alcohol', 'Malic acid', 'OD280/OD315 of diluted wines'], dtype='object')

knn.fit(X_train_std, y_train)
print('Training accuracy:', knn.score(X_train_std, y_train))
print('Test accuracy:', knn.score(X_test_std, y_test))

Training accuracy: 0.967741935483871
Test accuracy: 0.9629629629629629

knn.fit(X_train_std[:, k3], y_train)
print('Training accuracy:', knn.score(X_train_std[:, k3], y_train))
print('Test accuracy:', knn.score(X_test_std[:, k3], y_test))

Training accuracy: 0.9516129032258065
Test accuracy: 0.9259259259259259

from sklearn.ensemble import RandomForestClassifier

feat_labels = df_wine.columns[1:]

forest = RandomForestClassifier(n_estimators=500,
                                random_state=1)

forest.fit(X_train, y_train)
importances = forest.feature_importances_

indices = np.argsort(importances)[::-1]

for f in range(X_train.shape[1]):
    print("%2d) %-*s %f" % (f + 1, 30,
                            feat_labels[indices[f]],
                            importances[indices[f]]))

plt.title('Feature Importance')
plt.bar(range(X_train.shape[1]),
        importances[indices],
        align='center')

plt.xticks(range(X_train.shape[1]),
           feat_labels[indices], rotation=90)
plt.xlim([-1, X_train.shape[1]])
plt.tight_layout()
#plt.savefig('images/04_09.png', dpi=300)
plt.show()

1) Proline 0.185453
2) Flavanoids 0.174751
3) Color intensity 0.143920
4) OD280/OD315 of diluted wines 0.136162
5) Alcohol 0.118529
6) Hue 0.058739
7) Total phenols 0.050872
8) Magnesium 0.031357
9) Malic acid 0.025648
10) Proanthocyanins 0.025570
11) Alcalinity of ash 0.022366
12) Nonflavanoid phenols 0.013354
13) Ash 0.013279

from sklearn.feature_selection import SelectFromModel

sfm = SelectFromModel(forest, threshold=0.1, prefit=True)
X_selected = sfm.transform(X_train)
print('Number of features that meet this threshold criterion:',
      X_selected.shape[1])


Number of features that meet this threshold criterion: 5
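
To list the selected feature names directly, the fitted selector exposes a boolean mask via get_support(); a small sketch using the feat_labels defined above:

# mask of columns whose importance met the 0.1 threshold
mask = sfm.get_support()
print(feat_labels[mask].tolist())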

for f in range(X_selected.shape[1]):
    print("%2d) %-*s %f" % (f + 1, 30,
                            feat_labels[indices[f]],
                            importances[indices[f]]))
