Assignment 2 Ds
Assignment 2 Ds
printed, and missing values are indicated by the value NaN. Implement the
methods below for handling missing values.
Method 1: Replace missing values with zeros
Method 2: Dropping rows with missing values
Method 3: Replace missing values with Mean/Median/Mode
Method 4: Fill NaN values with the value from the previous rows
Method 5: Fill NaN values with the value from the next rows
Method 6: Fill missing values using interpolation method: Linear Interpolation
import pandas as pd
import numpy as np
# Sample frame stored as float32 ('f'); NaN marks the missing entries in
# columns c1 and c3 (row index 2).
df = pd.DataFrame(
    {
        'c1': [1, 2, np.nan, 4],
        'c2': [5, 6, 7, 8],
        'c3': [9, 10, np.nan, 12],
    },
    dtype='f',
)
print("Dataframe is:\n", df)
print()

# Each call below returns a new frame; df itself is never modified.
method_1 = df.fillna(0)
method_2 = df.dropna()
method_3 = df.fillna(df.mean())
# ffill()/bfill() are used instead of fillna(method=...), whose `method`
# argument is deprecated in pandas 2.1+.
method_4 = df.ffill()
method_5 = df.bfill()
method_6 = df.interpolate(method='linear')

print("Method 1: Replace missing values with zeros")
print(method_1)
print("\nMethod 2: Dropping rows with missing values")
print(method_2)
print("\nMethod 3: Replace missing values with Mean")
print(method_3)
print("\nMethod 4: Fill NaN values with the value from the previous rows")
print(method_4)
print("\nMethod 5: Fill NaN values with the value from the next rows")
print(method_5)
# The interpolated frame was computed but never shown; print it as well.
print("\nMethod 6: Fill missing values using interpolation method: Linear Interpolation")
print(method_6)
Q2. Write Python code to carry out equal-width binning for the prices
of nine items that are stored in a data frame. For equi-width binning, the
minimum and maximum price values are used to create three equal-width bins
named Low, Medium, and High. Plot a histogram for the three bins based
on the price range.
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
data = {
import numpy as np
import matplotlib.pyplot as plt
# Fix the seed so the same 1000 standard-normal draws are produced on each run.
np.random.seed(42)
data = np.random.normal(0, 1, 1000)
# Sample mean and standard deviation (NumPy default, ddof=0) of the draws.
mean = data.mean()
std_dev = data.std()
# Sample values; the trailing 10 and 12 repeat values that already occur
# near the start of the list.
data = [
    5, 8, 6, 10, 12, 7, 8, 15, 20, 22, 25, 28, 30, 32, 35, 38, 40, 42, 45,
    50, 55, 60, 65, 70, 75, 80, 85, 90, 95, 100, 105, 110, 115, 120, 125,
    130, 135, 140, 145, 150, 155, 160, 165, 170, 175, 180, 185, 190, 195,
    200, 205, 210, 215, 220, 225, 230, 235, 240, 245, 250, 255, 260, 265,
    270, 275, 280, 285, 290, 295, 10, 12,
]


def _show_histogram(values, fill, heading):
    """Draw a 20-bin histogram of *values* with black bar edges and show it."""
    plt.hist(values, bins=20, color=fill, edgecolor='black')
    plt.title(heading)
    plt.xlabel('Values')
    plt.ylabel('Frequency')
    plt.show()


_show_histogram(data, 'blue', 'Histogram with Outliers')

# Every occurrence of 10 and 12 is dropped, not only the trailing pair.
data_outliers = [value for value in data if value != 10 and value != 12]
_show_histogram(data_outliers, 'green', 'Histogram without Outliers')
# Sample values to be binned and then filtered with the interquartile rule.
data = [
    5, 8, 6, 10, 12, 7, 8, 15, 20, 22, 25, 28, 30, 32, 35, 38, 40, 42, 45,
    50, 55, 60, 65, 70, 75, 80, 85, 90, 95, 100, 105, 110, 115, 120, 125,
    130, 135, 140, 145, 150, 155, 160, 165, 170, 175, 180, 185, 190, 195,
    200, 205, 210, 215, 220, 225, 230, 235, 240, 245, 250, 255, 260, 265,
    270, 275, 280, 285, 290, 295, 10, 12,
]

# Histogram of the raw values using 20 equal-width bins.
plt.hist(data, bins=20, color='red', ec='black')
plt.title('Histogram with Equi-width Bins')
plt.xlabel('Values')
plt.ylabel('Frequency')
plt.show()

# Quartiles and the 1.5 * IQR bounds used to decide which values are kept.
Q1 = np.quantile(data, 0.25)
Q3 = np.quantile(data, 0.75)
IQR = Q3 - Q1
LB = Q1 - (1.5 * IQR)
UB = Q3 + (1.5 * IQR)

# Keep only the values inside [LB, UB]; both bounds are inclusive.
# (The original for/if/append lines had lost their indentation and did not parse.)
data_new = [x for x in data if LB <= x <= UB]

# Histogram of the values that survived the IQR filter.
plt.hist(data_new, bins=20, color='yellow', ec='black')
plt.title('Histogram without Outliers (Interquartile Method)')
plt.xlabel('Values')
plt.ylabel('Frequency')
plt.show()

print("Final dataset without outliers:")
print(data_new)