0% found this document useful (0 votes)
19 views

Machine Learning Lab - Preprocessing

Copyright
© All Rights Reserved
Available Formats
Download as PDF, TXT or read online on Scribd
0% found this document useful (0 votes)
19 views

Machine Learning Lab - Preprocessing

Copyright
© All Rights Reserved
Available Formats
Download as PDF, TXT or read online on Scribd
You are on page 1/ 13

21AI604/ 21CS644

Machine learning lab2


WORKING WITH DATA AND FILES
S.PADMAVATHI, CSE
DATA PREPROCESSING
import numpy as np
from sklearn import preprocessing

# Single feature stored as a column vector (one observation per row)
feature = np.array(
    [
        [-500.5],
        [-100.1],
        [0],
        [100.1],
        [900.9],
    ]
)

# Min-max scaler that rescales values into the range [0, 1]
minmax_scale = preprocessing.MinMaxScaler(
    feature_range=(0, 1),
)

# Learn the minimum/maximum from the feature and rescale it in one step
scaled_feature = minmax_scale.fit_transform(feature)

# Display the rescaled feature (notebook-style bare expression)
scaled_feature

21AI604 / 21CS644 _ML_ DR.S.PADMAVATHI


Preprocessing scaling
import numpy as np
from sklearn import preprocessing

# Create feature (a single column with one large outlier)
x = np.array([[-1000.1], [-200.2], [500.5], [600.6], [9000.9]])

# Create scaler
scaler = preprocessing.StandardScaler()

# Transform the feature: fit the scaler on x, then standardize x
standardized = scaler.fit_transform(x)

# Show feature (notebook-style bare expression)
standardized

# Print mean and standard deviation of the standardized feature;
# after standardization these are expected to be approximately 0 and 1.
print("Mean:", round(standardized.mean()))
print("Standard deviation:", standardized.std())

21AI604 / 21CS644 _ML_ DR.S.PADMAVATHI


Preprocessing - Normalize
import numpy as np
from sklearn import preprocessing
from sklearn.preprocessing import Normalizer

# Create feature matrix (one observation per row, two features per column)
features = np.array([[0.5, 0.5],
                     [1.1, 3.4],
                     [1.5, 20.2],
                     [1.63, 34.4],
                     [10.9, 3.3]])

# Create normalizer that rescales each observation (row) to unit L2 norm
normalizer = Normalizer(norm="l2")

# Transform feature matrix (notebook-style bare expression)
normalizer.transform(features)

# Create scaler
# NOTE: `x` is the single-column feature created in the standardization
# example earlier in this document; it is not defined in this snippet.
robust_scaler = preprocessing.RobustScaler()

# Transform feature
robust_scaler.fit_transform(x)

21AI604 / 21CS644 _ML_ DR.S.PADMAVATHI


Saving data in a file
# Load libraries needed by this snippet
import numpy as np
import pandas as pd

# Build a small DataFrame; np.nan marks a missing "children" value for alice
my_df = pd.DataFrame(
    [["Biking", 68.5, 1985, np.nan], ["Dancing", 83.1, 1984, 3]],
    columns=["hobby", "weight", "birthyear", "children"],
    index=["alice", "bob"],
)

# Show the DataFrame (notebook-style bare expression)
my_df

# Save the same DataFrame in three different formats
my_df.to_csv("my_df.csv")
my_df.to_html("my_df.html")
my_df.to_json("my_df.json")

# Read each saved file back as text and print its contents
for filename in ("my_df.csv", "my_df.html", "my_df.json"):
    print("#", filename)
    # Explicit encoding so the read does not depend on the platform default
    with open(filename, "rt", encoding="utf-8") as f:
        print(f.read())
        print()

21AI604 / 21CS644 _ML_ DR.S.PADMAVATHI


Detecting Outliers
# Draw an ellipse around the data; any observation outside the ellipse is
# treated as an outlier (labeled as -1)

# Load libraries
import numpy as np
from sklearn.covariance import EllipticEnvelope
from sklearn.datasets import make_blobs

# Create simulated data
features, _ = make_blobs(n_samples=10,
                         n_features=2,
                         centers=1,
                         random_state=1)

# Replace the first observation's values with extreme values
features[0, 0] = 10000
features[0, 1] = 10000

# Create detector; contamination is the assumed proportion of outliers
outlier_detector = EllipticEnvelope(contamination=.1)

# Fit detector
outlier_detector.fit(features)

# Predict outliers (notebook-style bare expression)
outlier_detector.predict(features)

21AI604 / 21CS644 _ML_ DR.S.PADMAVATHI


identify extreme values in features using
interquartile range (IQR)
# IQR is the difference between the first and third quartile of a set of data

# Create one feature
# NOTE: `features` (and `np`) come from the outlier-detection example earlier
# in this document; they are not defined in this snippet.
feature = features[:, 0]


# Create a function to return index of outliers
def indicies_of_outliers(x):
    """Return the indices of values lying outside the 1.5 * IQR fences.

    Args:
        x: Array-like of numeric values.

    Returns:
        The result of ``np.where`` on the outlier mask: a tuple of index
        arrays locating every value that is greater than
        ``q3 + 1.5 * iqr`` or less than ``q1 - 1.5 * iqr``.
    """
    # Accept plain sequences as well as arrays so the comparisons below
    # are always element-wise.
    x = np.asarray(x)
    q1, q3 = np.percentile(x, [25, 75])
    iqr = q3 - q1
    lower_bound = q1 - (iqr * 1.5)
    upper_bound = q3 + (iqr * 1.5)
    return np.where((x > upper_bound) | (x < lower_bound))


# Run function (notebook-style bare expression)
indicies_of_outliers(feature)

21AI604 / 21CS644 _ML_ DR.S.PADMAVATHI


Deleting Observations with Missing
Values
# Load library
import numpy as np

# Feature matrix: one observation per row; the last row has a missing value
features = np.array(
    [
        [1.1, 11.1],
        [2.2, 22.2],
        [3.3, 33.3],
        [4.4, 44.4],
        [np.nan, 55],
    ]
)

# Flag every row that contains at least one NaN ...
missing_rows = np.isnan(features).any(axis=1)

# ... and keep only the rows that are not (~) flagged
features[~missing_rows]

21AI604 / 21CS644 _ML_ DR.S.PADMAVATHI


drop missing observations using pandas
# Load library
import pandas as pd

# Wrap the existing `features` matrix in a DataFrame with named columns
dataframe = pd.DataFrame(
    data=features,
    columns=["feature_1", "feature_2"],
)

# Drop every row that contains at least one missing value
# (axis=0 / how="any" are the defaults, spelled out here for clarity)
dataframe.dropna(axis=0, how="any")

21AI604 / 21CS644 _ML_ DR.S.PADMAVATHI


# Drop the outliers:

# Load library
import pandas as pd

# Build the DataFrame column by column; the last row is the extreme one
houses = pd.DataFrame(
    {
        'Price': [534433, 392333, 293222, 4322032],
        'Bathrooms': [2, 3.5, 2, 116],
        'Square_Feet': [1500, 2500, 1500, 48000],
    }
)

# Keep only the observations with fewer than 20 bathrooms
houses.loc[houses['Bathrooms'] < 20]

21AI604 / 21CS644 _ML_ DR.S.PADMAVATHI


# Mark observations as outliers and include that flag as a feature:
# NOTE: `houses` is the DataFrame created in the "drop the outliers" example
# earlier in this document; it is not defined in this snippet.

# Load library
import numpy as np

# Create feature based on boolean condition:
# 0 when Bathrooms < 20, otherwise 1 (flagged as an outlier)
houses["Outlier"] = np.where(houses["Bathrooms"] < 20, 0, 1)

# Show data
houses

# Transform the feature to dampen the effect of the outlier:

# Log feature (natural log, applied to the whole column at once)
houses["Log_Of_Square_Feet"] = np.log(houses["Square_Feet"])

# Show data
houses

21AI604 / 21CS644 _ML_ DR.S.PADMAVATHI


CONVERT numerical feature into discrete
bins
# Binarize the feature according to some threshold:

# Load libraries
import numpy as np
from sklearn.preprocessing import Binarizer

# Create feature
age = np.array([[6],
                [12],
                [20],
                [36],
                [65]])

# Create binarizer
# The threshold must be passed by keyword: current scikit-learn releases
# declare it keyword-only, so Binarizer(18) raises a TypeError.
binarizer = Binarizer(threshold=18)

# Transform feature: values greater than the threshold become 1, others 0
binarizer.fit_transform(age)

# ANOTHER METHOD
# With a single bin edge, np.digitize returns 0 for values below 18 and
# 1 for values of 18 or more.
np.digitize(age, bins=[18])

21AI604 / 21CS644 _ML_ DR.S.PADMAVATHI


break up numerical features according to
multiple thresholds
# Bin the feature using several thresholds.
# NOTE: `age` (and `np`) come from the binarizing example earlier in this
# document; they are not defined in this snippet.

# Default behaviour (right=False): each threshold is the inclusive LEFT edge
# of a bin, i.e. bins[i-1] <= value < bins[i]
np.digitize(age, bins=[20, 30, 64], right=False)

# right=True: each threshold is the inclusive RIGHT edge of a bin,
# i.e. bins[i-1] < value <= bins[i]
np.digitize(age, bins=[20, 30, 64], right=True)

21AI604 / 21CS644 _ML_ DR.S.PADMAVATHI

You might also like