# WORKING WITH DATA AND FILES
# S.PADMAVATHI, CSE
#
# DATA PREPROCESSING
# Rescale a single numeric feature into the range [0, 1] with min-max scaling.

import numpy as np
from sklearn import preprocessing

# Create feature (one column, five observations)
feature = np.array([[-500.5], [-100.1], [0], [100.1], [900.9]])

# Create scaler
minmax_scale = preprocessing.MinMaxScaler(feature_range=(0, 1))

# Scale feature: fit on the data, then transform it in one step
scaled_feature = minmax_scale.fit_transform(feature)

# Show feature (notebook-style display; wrap in print() when run as a script)
scaled_feature
21AI604 / 21CS644 _ML_ DR.S.PADMAVATHI
# Preprocessing: scaling
# Standardize a feature with StandardScaler and report the mean and
# standard deviation of the transformed values.

import numpy as np
from sklearn import preprocessing

# Create feature
x = np.array([[-1000.1], [-200.2], [500.5], [600.6], [9000.9]])

# Create scaler
scaler = preprocessing.StandardScaler()

# Transform the feature
standardized = scaler.fit_transform(x)

# Show feature (notebook-style display; wrap in print() when run as a script)
standardized

# Print mean and standard deviation
# (these must run after `standardized` has been computed)
print("Mean:", round(standardized.mean()))
print("Standard deviation:", standardized.std())
# Saving data in a file
# Build a small DataFrame, write it out as CSV, HTML and JSON, then read each
# file back and print its raw contents.

import numpy as np
import pandas as pd

my_df = pd.DataFrame(
    [["Biking", 68.5, 1985, np.nan], ["Dancing", 83.1, 1984, 3]],
    columns=["hobby", "weight", "birthyear", "children"],
    index=["alice", "bob"],
)
# Show the frame (notebook-style display)
my_df

# Write the files first so the read-back loop below has something to open
my_df.to_csv("my_df.csv")
my_df.to_html("my_df.html")
my_df.to_json("my_df.json")

for filename in ("my_df.csv", "my_df.html", "my_df.json"):
    print("#", filename)
    # Explicit encoding so the read does not depend on the platform default
    with open(filename, "rt", encoding="utf-8") as f:
        print(f.read())
    print()
21AI604 / 21CS644 _ML_ DR.S.PADMAVATHI
# Detecting Outliers
# Draw an ellipse around the data; any observation outside the ellipse is
# treated as an outlier (labeled as -1).

# Load libraries
import numpy as np
from sklearn.covariance import EllipticEnvelope
from sklearn.datasets import make_blobs

# Create simulated data
features, _ = make_blobs(n_samples=10,
                         n_features=2,
                         centers=1,
                         random_state=1)

# Replace the first observation's values with extreme values
features[0, 0] = 10000
features[0, 1] = 10000

# Create detector
outlier_detector = EllipticEnvelope(contamination=.1)

# Fit detector
outlier_detector.fit(features)

# Predict outliers (notebook-style display; wrap in print() when run as a script)
outlier_detector.predict(features)
21AI604 / 21CS644 _ML_ DR.S.PADMAVATHI
# Identify extreme values in features using the interquartile range (IQR).
# IQR is the difference between the first and third quartile of a set of data.

# Create one feature
# NOTE(review): `features` is not defined on this slide; presumably it is the
# array built on the outlier-detection slide -- confirm.
feature = features[:, 0]


# Create a function to return index of outliers
def indicies_of_outliers(x):
    """Return the indices of values lying outside 1.5 * IQR of the quartiles.

    Args:
        x: Array-like of numeric values.

    Returns:
        The result of ``np.where`` on the outlier mask: a tuple of index
        arrays, one per dimension of ``x``.
    """
    # Coerce so plain sequences support the element-wise comparisons below
    x = np.asarray(x)
    q1, q3 = np.percentile(x, [25, 75])
    iqr = q3 - q1
    lower_bound = q1 - (iqr * 1.5)
    upper_bound = q3 + (iqr * 1.5)
    return np.where((x > upper_bound) | (x < lower_bound))


# Run function (must come after the definition above)
indicies_of_outliers(feature)
21AI604 / 21CS644 _ML_ DR.S.PADMAVATHI
# Deleting Observations with Missing Values

# Load library
import numpy as np

# Create feature matrix
features = np.array([[1.1, 11.1],
                     [2.2, 22.2],
                     [3.3, 33.3],
                     [4.4, 44.4],
                     [np.nan, 55]])

# Keep only observations that are not (denoted by ~) missing:
# a row is dropped when any of its columns is NaN
# (notebook-style display; wrap in print() when run as a script)
features[~np.isnan(features).any(axis=1)]
21AI604 / 21CS644 _ML_ DR.S.PADMAVATHI
# Drop missing observations using pandas

# Load library
import pandas as pd

# Load data
# NOTE(review): `features` is not defined on this slide; presumably it is the
# feature matrix from the "Deleting Observations with Missing Values" slide --
# confirm.
dataframe = pd.DataFrame(features, columns=["feature_1", "feature_2"])

# Remove observations with missing values
# (returns a new frame; notebook-style display, wrap in print() in a script)
dataframe.dropna()
# Mark the feature as an outlier and include that flag as a feature:

# Load library
import numpy as np

# Create feature based on boolean condition (0 when Bathrooms < 20, else 1)
# NOTE(review): `houses` is not defined on this slide; presumably a pandas
# DataFrame with "Bathrooms" and "Square_Feet" columns -- confirm.
houses["Outlier"] = np.where(houses["Bathrooms"] < 20, 0, 1)

# Show data
houses

# Transform the feature to dampen the effect of the outlier:

# Log feature (vectorized natural log of the whole column)
houses["Log_Of_Square_Feet"] = np.log(houses["Square_Feet"])

# Show data
houses
21AI604 / 21CS644 _ML_ DR.S.PADMAVATHI
# CONVERT numerical feature into discrete bins
# Binarize the feature according to some threshold:

# Load libraries
import numpy as np
from sklearn.preprocessing import Binarizer

# Create feature
age = np.array([[6],
                [12],
                [20],
                [36],
                [65]])

# Create binarizer
# `threshold` is passed by keyword: recent scikit-learn releases reject it
# as a positional argument.
binarizer = Binarizer(threshold=18)

# Transform feature
binarizer.fit_transform(age)

# ANOTHER METHOD
np.digitize(age, bins=[18])
21AI604 / 21CS644 _ML_ DR.S.PADMAVATHI
# Break up numerical features according to multiple thresholds
# NOTE(review): `age` and `np` are not defined on this slide; presumably they
# come from the binarizing slide -- confirm.

# Bin feature
# The bins parameter denotes the left edge of each bin
# (a value equal to an edge falls into the bin that starts at that edge)
np.digitize(age, bins=[20, 30, 64])

# The bins parameter denotes the RIGHT edge of each bin
# (a value equal to an edge falls into the bin that ends at that edge)
np.digitize(age, bins=[20, 30, 64], right=True)