Assignment 1
Name: Satyajit Shinde
Div: TY AI C Roll No.: 41
PRN: 12211701
Develop and implement a Linear Regression model using the Least
Squares Estimation method to predict a target variable based on a
given dataset. Calculate the sum of squared differences between
the actual and predicted values. The implementation should
include dataset preprocessing, model training, and performance
evaluation using metrics such as Mean Squared Error (MSE).
Code:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
# Load the salary dataset; the columns used below are "YearsExperience" and "Salary".
df = pd.read_csv("C:\\Users\\user\\Desktop\\Sem 6\\SI\\salary_data.csv")
df.head()  # bare expression: only displays output in a notebook/REPL
df.shape

# X holds the independent variable, y the dependent variable.
X = df["YearsExperience"]
y = df["Salary"]

# Positional hold-out split: the first 21 rows train the model,
# every remaining row is kept for testing.
X_train, X_test = X.iloc[:21], X.iloc[21:]
y_train, y_test = y.iloc[:21], y.iloc[21:]
X_train, y_train
# Least-squares fit of the line  y = a + b*x  on the training rows.
#   b = (N*Sxy - Sx*Sy) / (N*Sxx - Sx^2)
#   a = (Sy - b*Sx) / N
N = len(X_train)
sum_X, sum_Y = sum(X_train), sum(y_train)
sum_XY = sum(X_train * y_train)
sum_X_square = sum(X_train ** 2)

# NOTE: the denominator is zero when every X_train value is identical
# (or when there are no training rows), in which case no slope exists.
slope_numerator = N * sum_XY - sum_X * sum_Y
slope_denominator = N * sum_X_square - sum_X ** 2
b = slope_numerator / slope_denominator
a = (sum_Y - b * sum_X) / N
# Predicting Value
def pred(a,b,x):
    """Evaluate the straight line ``a + b*x``.

    Args:
        a: Intercept of the line.
        b: Slope of the line.
        x: Input value. Anything supporting ``*`` and ``+`` with ``a`` and
            ``b`` works, so a scalar gives a scalar and an array-like
            (NumPy array, pandas Series) gives element-wise predictions.

    Returns:
        The predicted value(s) ``a + b*x``.
    """
    return a + b * x
# Print the fitted salary for every training sample, then every test sample.
for x in X_train:
    print(f"Experience : {x} and expected salary is : {pred(a,b,x)}")
for x in X_test:
    print(f"Experience : {x} and expected salary is : {pred(a,b,x)}")

# Single-point prediction for x = 6.
c = pred(a,b,6)
c  # bare expression: only displays output in a notebook/REPL

# Predicting on the test and train sets. Passing the whole Series to pred()
# evaluates the line element-wise.
pred_test = pred(a, b, X_test)
pred_train = pred(a, b, X_train)
pred_test
pred_train
# Plot the fitted line over the training points.
plt.plot(X_train, pred_train, color="yellow")
plt.scatter(X_train, y_train)
plt.show()

# Plot predicted values against actual values on the test set.
plt.plot(X_test, pred_test, label='Model Prediction')
plt.scatter(X_test, pred_test, color='red', label='Predicted')
plt.scatter(X_test, y_test, label='Actual')

# Annotate every point with its value. The loop variables are named
# y_pred / y_actual rather than `y`, so the module-level `y` Series (the
# Salary column) is not overwritten by a single float.
for x, y_pred in zip(X_test, pred_test):
    plt.annotate(
        f'{y_pred:.2f}',
        (x, y_pred),
        textcoords="offset points",
        xytext=(5, 5),
        ha='left',
        color='red',
    )
for x, y_actual in zip(X_test, y_test):
    plt.annotate(
        f'{y_actual:.2f}',
        (x, y_actual),
        textcoords="offset points",
        xytext=(5, 5),
        ha='right',
    )

# Dashed vertical segment between each prediction and its actual value,
# i.e. the residual for that test sample.
for x, y_pred, y_actual in zip(X_test, pred_test, y_test):
    plt.plot([x, x], [y_pred, y_actual], color='gray', linestyle='--')

plt.xlabel('X_test')
plt.ylabel('Values')
plt.legend()
plt.show()
# Calculating mean squared error
# Every call to mean_squared_error() appends its element-wise squared errors
# here, in call order.
error_list = []


def mean_squared_error(true,pred):
    """Return the mean of the squared differences between two sequences.

    Args:
        true: Actual values; must support element-wise ``-`` with ``pred``
            (e.g. NumPy array or pandas Series) and ``len()``.
        pred: Predicted values, aligned with ``true``. Inside this function
            the name shadows the module-level ``pred`` function.

    Returns:
        ``sum((true - pred) ** 2) / len(true)``.

    Raises:
        ZeroDivisionError: If ``true`` is empty.

    Side effects:
        Appends the element-wise squared errors to the module-level
        ``error_list``.
    """
    squared_error = (true - pred) ** 2
    # Kept so existing readers of error_list keep seeing one entry per call.
    error_list.append(squared_error)
    mse = sum(squared_error) / len(true)
    return mse
# Score the fitted line on the test rows first, then on the training rows.
mse_test = mean_squared_error(true=y_test, pred=pred_test)
mse_train = mean_squared_error(true=y_train, pred=pred_train)

print(f"Mean Squared Error for testing set is : {mse_test}")
print(f"Mean Squared Error for training set is : {mse_train}")
def abs_error(true,pred):
    """Return the mean absolute difference between two sequences.

    Args:
        true: Actual values; must support element-wise ``-`` with ``pred``
            (e.g. NumPy array or pandas Series) and ``len()``.
        pred: Predicted values, aligned with ``true``.

    Returns:
        ``sum(abs(true - pred)) / len(true)``.

    Raises:
        ZeroDivisionError: If ``true`` is empty.

    Side effects:
        Prints the element-wise absolute errors to stdout on every call.
    """
    error = abs(true - pred)
    print(f"Error is:\n{error}")
    final = sum(error)
    ae = final / len(true)
    return ae
error_list_mse = []
error_list_mae = []

# The training predictions do not depend on the loop index, so they are
# built once instead of being rebuilt on every iteration.
y_pred = np.array([pred(a, b, x) for x in X_train])

# NOTE(review): every iteration scores the same (y_train, y_pred) pair, so
# both lists end up holding N copies of one value, and abs_error() prints the
# same error vector N times. Confirm whether a single evaluation (or a
# per-sample error) was intended.
for _ in range(N):
    mae = abs_error(y_train, y_pred)
    mse = mean_squared_error(y_train, y_pred)
    error_list_mse.append(mse)
    error_list_mae.append(mae)