LAB EXERCISE 2 - Data Preprocessing
LAB EXERCISE 2 - Data Preprocessing
Data Preprocessing
The categorical values Female and Male in the gender column of the dataset can be converted to 0 and 1
using LabelEncoder. It is done as given below:
# Encode the text labels in the gender column as integers.
df_gender_encode=LabelEncoder()
df.gender=df_gender_encode.fit_transform(df.gender)
# Standardize the Marks column (zero mean, unit variance).
df.Marks = preprocessing.scale(df.Marks)
scaled_df= preprocessing.scale(df.Marks)
# Binarize the scaled marks: values above the 0.5 threshold become 1, the rest 0.
# newarr is the scaled column reshaped to a single-column 2-D array (see Listing 1).
scaled_df_bin = preprocessing.Binarizer(threshold=0.5).transform(newarr)
# Remove repeated rows from a frame that contains duplicates.
df_duplicates_removed = pd.DataFrame.drop_duplicates(df_duplicated)
# Replace the missing values in column m5 with 0.
df['m5']=df['m5'].fillna(0)
The command below removes every column that still contains a missing value:
# axis=1 drops columns (not rows) that contain at least one missing value.
df=df.dropna(axis=1)
Listing 1
# Listing 1: label-encode the gender column, standardize Marks, then binarize them.
import pandas as pd
# Both names are used below and must be imported for the listing to run.
from sklearn import preprocessing
from sklearn.preprocessing import LabelEncoder

col_list = ["id", "first", "last", "gender", "Marks", "selected"]
df = pd.read_csv("sample.csv", usecols=col_list)
print(df)
print("End of Listing\n\n\n")

# Replace the text labels in the gender column with integer codes.
df_gender_encode = LabelEncoder()
df.gender = df_gender_encode.fit_transform(df.gender)
print(df)
print("End of Listing\n\n\n")

# Standardize Marks in the frame, then keep a NumPy copy of the scaled values;
# scale() returns an ndarray, which is what reshape() below needs.
df.Marks = preprocessing.scale(df.Marks)
scaled_df = preprocessing.scale(df.Marks)
print(df)

# Binarizer expects a 2-D input, so turn the values into a single column.
newarr = scaled_df.reshape(-1, 1)
scaled_df_bin = preprocessing.Binarizer(threshold=0.5).transform(newarr)
# Flatten the (n, 1) result back to 1-D before storing it as a column.
df['Marks'] = scaled_df_bin.ravel()
print(df)
Output
# Listing 2: remove duplicate rows from a data frame.
import pandas as pd

col_list = ["id", "first", "last", "gender", "Marks", "selected"]
df = pd.read_csv("sample.csv", usecols=col_list)
print(df)
print("End of Listing\n\n\n")

# Build a frame that is guaranteed to contain repeated rows by stacking the
# loaded data on top of itself; df_duplicated must exist before it is used.
df_duplicated = pd.concat([df, df], ignore_index=True)
print(df_duplicated)

# drop_duplicates keeps the first occurrence of each repeated row.
df_duplicates_removed = pd.DataFrame.drop_duplicates(df_duplicated)
print(df_duplicates_removed)
Output
# Listing 3: handling missing values (constant, mean and median imputation,
# and dropping incomplete columns).
import pandas as pd

df = pd.DataFrame({
    'm1': [50, 'A', 60, 'A', 80],
    'm2': [60, 'A', '60', 'A', 80],
    'm3': [50, 70, 'A', 'A', 60],
    'm4': [60, 'A', 'A', 'A', 60],
    'm5': ['A', 'A', 'A', 10, 20],
})

# Convert every column to numbers; entries that cannot be parsed ('A') become NaN.
df = df.apply(pd.to_numeric, errors='coerce')
print(df)

# Constant imputation: replace the missing values in m5 with 0.
df['m5'] = df['m5'].fillna(0)
print(df)

# Mean imputation on a copy. Assign the result back instead of using a chained
# fillna(..., inplace=True), which newer pandas versions warn about.
df1 = df.copy()
df1['m2'] = df1['m2'].fillna(df1['m2'].mean())
print(df1)

# Median imputation on a second copy: fill m3 of df2 with the median of m3.
df2 = df.copy()
df2['m3'] = df2['m3'].fillna(df2['m3'].median())
print(df2)

# Drop every column that still contains a missing value (only m5 is complete).
df = df.dropna(axis=1)
print(df)
Output
This listing illustrates the use of MinMax scaling (MinMaxScaler) and of Standard scaling (StandardScaler), the latter for finding Z-scores.
# Listing 4: compare min-max scaling with standard (Z-score) scaling.
from numpy import asarray
from sklearn.preprocessing import MinMaxScaler, StandardScaler

data = asarray([[1, 3], [8, 5], [6, 7], [8, 9]])
print(data)

# scaler1 rescales each column to the [0, 1] range; scaler2 standardizes each
# column to zero mean and unit variance (Z-scores).
scaler1 = MinMaxScaler()
scaler2 = StandardScaler()
scaled1 = scaler1.fit_transform(data)
scaled2 = scaler2.fit_transform(data)
print(scaled1)
print(scaled2)
Output