Machine Learning Lab - Preprocessing

21AI604 / 21CS644

Machine Learning Lab 2

WORKING WITH DATA AND FILES
S.PADMAVATHI, CSE
DATA PREPROCESSING
import numpy as np
from sklearn import preprocessing
# Create feature
feature = np.array([[-500.5],[-100.1],[0],[100.1],[900.9]])
# Create scaler
minmax_scale = preprocessing.MinMaxScaler(feature_range=(0, 1))
# Scale feature
scaled_feature = minmax_scale.fit_transform(feature)
# Show feature
scaled_feature
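Min-max scaling maps each value to x' = (x - min) / (max - min), so the smallest value becomes 0 and the largest becomes 1. As a quick sanity check (a minimal sketch, assuming the feature array above, not part of the original lab), the same result can be computed by hand:

# Manual min-max scaling for comparison (sketch)
manual = (feature - feature.min()) / (feature.max() - feature.min())
print(manual)  # should match scaled_feature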



Preprocessing - Scaling
import numpy as np
from sklearn import preprocessing
# Create feature
x = np.array([[-1000.1], [-200.2], [500.5], [600.6], [9000.9]])
# Create scaler
scaler = preprocessing.StandardScaler()
# Transform the feature
standardized = scaler.fit_transform(x)
# Show feature
standardized
# Print mean and standard deviation
print("Mean:", round(standardized.mean()))
print("Standard deviation:", standardized.std())
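StandardScaler rescales each value to a z-score, z = (x - mean) / std, so the transformed feature has mean 0 and standard deviation 1. As a quick check (a minimal sketch, assuming the x defined above), the same result can be computed by hand:

# Manual standardization for comparison (sketch, not part of the original lab)
manual = (x - x.mean()) / x.std()
print(manual)  # should match standardized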



Preprocessing - Normalize
import numpy as np
from sklearn import preprocessing
from sklearn.preprocessing import Normalizer
# Create feature matrix
features = np.array([[0.5, 0.5],
                     [1.1, 3.4],
                     [1.5, 20.2],
                     [1.63, 34.4],
                     [10.9, 3.3]])
# Create normalizer
normalizer = Normalizer(norm="l2")
# Transform feature matrix (rescale each observation to unit norm)
normalizer.transform(features)

Preprocessing - Robust scaling
# Create scaler (uses the median and IQR, so it is less affected by outliers)
robust_scaler = preprocessing.RobustScaler()
# Transform feature (x from the standardization example above)
robust_scaler.fit_transform(x)
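With norm="l2", every row of the transformed matrix should have length 1. A quick check (sketch, assuming the objects above):

# Verify that each observation now has unit L2 norm (sketch)
print(np.linalg.norm(normalizer.transform(features), axis=1))  # all ~1.0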



Saving data in a file
# Load libraries
import numpy as np
import pandas as pd
# Create a small DataFrame
my_df = pd.DataFrame(
    [["Biking", 68.5, 1985, np.nan], ["Dancing", 83.1, 1984, 3]],
    columns=["hobby", "weight", "birthyear", "children"],
    index=["alice", "bob"]
)
my_df
# Save the DataFrame in several formats
my_df.to_csv("my_df.csv")
my_df.to_html("my_df.html")
my_df.to_json("my_df.json")
# Read each file back and print its contents
for filename in ("my_df.csv", "my_df.html", "my_df.json"):
    print("#", filename)
    with open(filename, "rt") as f:
        print(f.read())
    print()
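Loading the CSV back into a DataFrame completes the round trip; a minimal sketch (index_col=0 restores the alice/bob index saved above):

# Round trip: read the saved CSV back (sketch, not part of the original lab)
my_df_loaded = pd.read_csv("my_df.csv", index_col=0)
print(my_df_loaded)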



Detecting Outliers
# Draw an ellipse around the observations; any observation outside the ellipse is treated as an outlier (labeled -1)
# Load libraries
import numpy as np
from sklearn.covariance import EllipticEnvelope
from sklearn.datasets import make_blobs
# Create simulated data
features, _ = make_blobs(n_samples = 10,
                         n_features = 2,
                         centers = 1,
                         random_state = 1)
# Replace the first observation's values with extreme values
features[0,0] = 10000
features[0,1] = 10000
# Create detector
outlier_detector = EllipticEnvelope(contamination=.1)
# Fit detector
outlier_detector.fit(features)
# Predict outliers
outlier_detector.predict(features)
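With contamination=.1 on ten observations, the detector is told to expect roughly one outlier, so the doctored first row should come back labeled -1 and the rest 1. A quick check (sketch):

# Inspect the predictions (sketch; -1 = outlier, 1 = inlier)
preds = outlier_detector.predict(features)
print(preds)
print(np.where(preds == -1))  # expected to flag the first observation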



Identify extreme values in a feature using the interquartile range (IQR)
# IQR is the difference between the first and third quartiles of a set of data
# Create one feature
feature = features[:,0]
# Create a function to return the indices of outliers
def indices_of_outliers(x):
    q1, q3 = np.percentile(x, [25, 75])
    iqr = q3 - q1
    lower_bound = q1 - (iqr * 1.5)
    upper_bound = q3 + (iqr * 1.5)
    return np.where((x > upper_bound) | (x < lower_bound))
# Run function
indices_of_outliers(feature)
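The function flags anything outside [q1 - 1.5*IQR, q3 + 1.5*IQR], the classic 1.5×IQR rule. A toy example (sketch, data made up for illustration):

# Toy check of the 1.5*IQR rule (sketch)
data = np.array([1, 2, 3, 4, 100])
print(indices_of_outliers(data))  # q1=2, q3=4, bounds [-1, 7] -> flags index 4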



Deleting Observations with Missing
Values
# Load library
import numpy as np
# Create feature matrix
features = np.array([[1.1, 11.1],
                     [2.2, 22.2],
                     [3.3, 33.3],
                     [4.4, 44.4],
                     [np.nan, 55]])
# Keep only observations with no missing values (~ negates the mask)
features[~np.isnan(features).any(axis=1)]
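The mask itself is worth a look: np.isnan(features).any(axis=1) is True for any row containing a NaN, and ~ flips it to select the complete rows. A quick look (sketch):

# Row-wise missing-value mask (sketch)
print(np.isnan(features).any(axis=1))  # [False False False False  True]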



Drop missing observations using pandas
# Load library
import pandas as pd
# Load data
dataframe = pd.DataFrame(features, columns=["feature_1", "feature_2"])
# Remove observations with missing values
dataframe.dropna()



Drop the outliers
# Load library
import pandas as pd
# Create DataFrame
houses = pd.DataFrame()
houses['Price'] = [534433, 392333, 293222, 4322032]
houses['Bathrooms'] = [2, 3.5, 2, 116]
houses['Square_Feet'] = [1500, 2500, 1500, 48000]
# Filter observations
houses[houses['Bathrooms'] < 20]
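The boolean filter keeps only the rows where the condition holds. The same filter can also be written with DataFrame.query, an equivalent alternative (sketch, not part of the original lab):

# Equivalent filter using query (sketch)
houses.query("Bathrooms < 20")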



Mark feature as an outlier and include it as a feature
# Load library
import numpy as np
# Create feature based on a boolean condition
houses["Outlier"] = np.where(houses["Bathrooms"] < 20, 0, 1)
# Show data
houses

Transform the feature to dampen the effect of the outlier
# Log feature
houses["Log_Of_Square_Feet"] = [np.log(x) for x in houses["Square_Feet"]]
# Show data
houses
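The list comprehension works, but the same transform can be written vectorized, which is the more idiomatic pandas/NumPy style (sketch):

# Vectorized equivalent of the log transform (sketch)
houses["Log_Of_Square_Feet"] = np.log(houses["Square_Feet"])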



CONVERT numerical feature into discrete bins
# Binarize the feature according to some threshold
# Load libraries
import numpy as np
from sklearn.preprocessing import Binarizer
# Create feature
age = np.array([[6],
                [12],
                [20],
                [36],
                [65]])
# Create binarizer (threshold is keyword-only in recent scikit-learn versions)
binarizer = Binarizer(threshold=18)
# Transform feature
binarizer.fit_transform(age)

# Another method
np.digitize(age, bins=[18])
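Both approaches split the ages at 18: Binarizer maps values above the threshold to 1 and the rest to 0, while np.digitize with bins=[18] places values >= 18 into bin 1. For the ages above, both should produce [0, 0, 1, 1, 1]. A quick comparison (sketch):

# Compare the two binarization methods (sketch)
print(binarizer.fit_transform(age).ravel())  # expected: [0 0 1 1 1]
print(np.digitize(age, bins=[18]).ravel())   # expected: [0 0 1 1 1]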



Break up numerical features according to multiple thresholds
# Bin feature
# The bins parameter denotes the left edge of each bin
np.digitize(age, bins=[20,30,64])
# With right=True, the bins parameter denotes the right edge of each bin
np.digitize(age, bins=[20,30,64], right=True)
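For the ages [6, 12, 20, 36, 65], the left-edge call should return bins [0, 0, 1, 2, 3], while right=True moves 20 into bin 0 because 20 <= 20. A quick check (sketch):

# Compare left-edge and right-edge binning (sketch)
print(np.digitize(age, bins=[20,30,64]).ravel())              # expected: [0 0 1 2 3]
print(np.digitize(age, bins=[20,30,64], right=True).ravel())  # expected: [0 0 0 2 3]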

