Linear Regression
Linear Regression
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn import metrics
# Parenthesised so the import list can span several lines; a bare trailing
# comma followed by a line break is a SyntaxError.
from sklearn.metrics import (
    mean_absolute_error,
    mean_squared_error,
    r2_score,
)

# Load the emissions table from a CSV file in the working directory.
df = pd.read_csv('co2.csv')
# Preview the first rows to sanity-check the load.
df.head()
1 7.7 9.6
2 5.8 5.9
3 9.1 11.1
4 8.7 10.6
# Dimensions of the loaded table as a (rows, columns) tuple.
df.shape
# Column labels of the DataFrame.
df.columns
(7385, 8)
# Count how many rows fall under each distinct "Fuel Type" value.
df["Fuel Type"].value_counts()
Fuel Type
X 3637
Z 3202
E 370
D 175
N 1
Name: count, dtype: int64
Fuel Type
3 3637
4 3202
1 370
0 175
2 1
Name: count, dtype: int64
Correlation
# Pairwise correlation between the DataFrame's columns (Pearson by default).
correlation = df.corr()
# Bare expression so the notebook renders the correlation matrix.
correlation
Engine Size(L) Cylinders Fuel Type
\
Engine Size(L) 1.000000 0.927653 0.058296
CO2 Emissions(g/km)
Engine Size(L) 0.851145
Cylinders 0.832644
Fuel Type 0.100306
Fuel Consumption City (L/100 km) 0.919592
Fuel Consumption Hwy (L/100 km) 0.883536
Fuel Consumption Comb (L/100 km) 0.918052
Fuel Consumption Comb (mpg) -0.907426
CO2 Emissions(g/km) 1.000000
<Axes: >
DATA SPLITTING
# Separate the predictor columns from the regression target.
X = df.drop(columns=['CO2 Emissions(g/km)'])
Y = df['CO2 Emissions(g/km)']

# Hold out part of the data for evaluation. The model fitting and evaluation
# code uses X_TRAIN / X_TEST / Y_TRAIN / Y_TEST, but no split was present in
# this file, so those names were undefined (NameError at model.fit).
# NOTE(review): test_size and random_state are assumed values; the original
# split settings are not recorded in this file -- confirm before comparing
# against previously reported scores.
X_TRAIN, X_TEST, Y_TRAIN, Y_TEST = train_test_split(
    X, Y, test_size=0.2, random_state=42
)

# Preview the feature matrix (target column removed).
X.head()
1 7.7 9.6
2 5.8 5.9
3 9.1 11.1
4 8.7 10.6
LINEAR REGRESSION
from sklearn.linear_model import LinearRegression

# Linear regression estimator, fitted on the training portion only so the
# test rows stay unseen until evaluation.
model = LinearRegression()
model.fit(X=X_TRAIN, y=Y_TRAIN)
LinearRegression()
Evaluation
Prediction on Training Data
# Predictions for the same rows the model was fitted on (in-sample fit).
training_data_prediction = model.predict(X_TRAIN)
print(training_data_prediction)

# R squared is a goodness-of-fit score (higher is better), not an error.
score_1 = metrics.r2_score(Y_TRAIN, training_data_prediction)
# Mean absolute error, expressed in the same units as the target column.
score_2 = metrics.mean_absolute_error(Y_TRAIN, training_data_prediction)

# Report both metrics; previously the R squared value was computed but
# never shown and the mean absolute error was not computed at all.
print("R squared :", score_1)
print("Mean Absolute Error :", score_2)
R squared : 0.9124830358066793
Mean Absolute Error : 11.128722988272692
# Actual vs. predicted target values on the training data; points close to
# the diagonal indicate a good fit. Labels name the quantity being modelled
# (CO2 emissions) rather than prices.
plt.scatter(Y_TRAIN, training_data_prediction)
plt.xlabel("Actual CO2 Emissions (g/km)")
plt.ylabel("Predicted CO2 Emissions (g/km)")
plt.title("Actual vs Predicted CO2 Emissions (Training Data)")
plt.show()
Prediction on Test Data
# Predictions for the held-out test rows.
y_pred = model.predict(X_TEST)
print(y_pred)

# R squared score and mean absolute error on the test data, reported the
# same way as for the training data so the two can be compared directly.
# NOTE: score_1 is reused here, so from this point it holds the test score.
score_1 = metrics.r2_score(Y_TEST, y_pred)
score_2 = metrics.mean_absolute_error(Y_TEST, y_pred)
print("R squared :", score_1)
print("Mean Absolute Error :", score_2)
# Actual vs. predicted target values on the test data; points close to the
# diagonal indicate predictions that match the observed values.
plt.scatter(Y_TEST, y_pred)
plt.xlabel("Actual")
plt.ylabel("Predicted")
plt.title("Actual vs Predicted")
plt.show()