0% found this document useful (0 votes)
35 views6 pages

Practical 1

Uploaded by

manasishivarkar
Copyright
© All Rights Reserved
We take content rights seriously. If you suspect this is your content, claim it here.
Available Formats
Download as PDF, TXT or read online on Scribd
0% found this document useful (0 votes)
35 views6 pages

Practical 1

Uploaded by

manasishivarkar
Copyright
© All Rights Reserved
We take content rights seriously. If you suspect this is your content, claim it here.
Available Formats
Download as PDF, TXT or read online on Scribd
You are on page 1/ 6

import pandas as pd

import seaborn as sns

# Load the ride data from 'uber.csv' in the working directory.
df = pd.read_csv('uber.csv')
# Preview the first rows to check the columns were parsed as expected.
df.head()

Unnamed: 0 key fare_amount \


0 24238194 2015-05-07 19:52:06.0000003 7.5
1 27835199 2009-07-17 20:04:56.0000002 7.7
2 44984355 2009-08-24 21:45:00.00000061 12.9
3 25894730 2009-06-26 08:22:21.0000001 5.3
4 17610152 2014-08-28 17:47:00.000000188 16.0

pickup_datetime pickup_longitude pickup_latitude \


0 2015-05-07 19:52:06 UTC -73.999817 40.738354
1 2009-07-17 20:04:56 UTC -73.994355 40.728225
2 2009-08-24 21:45:00 UTC -74.005043 40.740770
3 2009-06-26 08:22:21 UTC -73.976124 40.790844
4 2014-08-28 17:47:00 UTC -73.925023 40.744085

dropoff_longitude dropoff_latitude passenger_count


0 -73.999512 40.723217 1
1 -73.994710 40.750325 1
2 -73.962565 40.772647 1
3 -73.965316 40.803349 3
4 -73.973082 40.761247 5

# Number of missing values in each column.
df.isna().sum()

Unnamed: 0 0
key 0
fare_amount 0
pickup_datetime 0
pickup_longitude 0
pickup_latitude 0
dropoff_longitude 1
dropoff_latitude 1
passenger_count 0
dtype: int64

# Remove columns that are not used as features: 'Unnamed: 0' and 'key'
# (presumably a leftover row index and a per-trip identifier - confirm), and
# the dropoff coordinates, which each contain one missing value.
# NOTE(review): dropping the dropoff coordinates also discards any
# trip-distance signal; consider dropping the single incomplete row instead.
df = df.drop(
    columns=['Unnamed: 0', 'key', 'dropoff_longitude', 'dropoff_latitude']
)

# Inspect the column dtypes that remain after dropping the unused columns.
df.dtypes

fare_amount float64
pickup_datetime object
pickup_longitude float64
pickup_latitude float64
passenger_count int64
dtype: object
# Cast the fare to an integer dtype. The fractional part is discarded
# (e.g. 7.5 becomes 7), so the regression target loses precision.
# NOTE(review): confirm this truncation is intended; keeping float64 would
# preserve the original fares.
df['fare_amount'] = df['fare_amount'].astype('int')
# The same cast for the coordinates is left disabled.
# df['pickup_longitude'] = df['pickup_longitude'].astype('int')
# df['pickup_latitude'] = df['pickup_latitude'].astype('int')

# Re-check the dtypes to confirm that fare_amount is now an integer column.
df.dtypes

fare_amount int32
pickup_datetime object
pickup_longitude float64
pickup_latitude float64
passenger_count int64
dtype: object

from datetime import datetime


# Parse the pickup timestamps into datetimes. errors='coerce' turns any value
# that cannot be parsed into NaT instead of raising.
df['pickup_datetime'] = pd.to_datetime(df['pickup_datetime'], errors='coerce')

# Break the pickup timestamp into separate calendar features.
pickup_parts = df['pickup_datetime'].dt
df['year'] = pickup_parts.year
df['month'] = pickup_parts.month
df['day'] = pickup_parts.day
df['hours'] = pickup_parts.hour
df['weekday'] = pickup_parts.weekday

# The raw timestamp is no longer needed once its parts are separate columns.
df = df.drop('pickup_datetime', axis=1)
df

        fare_amount  pickup_longitude  pickup_latitude  passenger_count  year  \
0                 7        -73.999817        40.738354                1  2015
1                 7        -73.994355        40.728225                1  2009
2                12        -74.005043        40.740770                1  2009
3                 5        -73.976124        40.790844                3  2009
4                16        -73.925023        40.744085                5  2014
...             ...               ...              ...              ...   ...
199995            3        -73.987042        40.739367                1  2012
199996            7        -73.984722        40.736837                1  2014
199997           30        -73.986017        40.756487                2  2009
199998           14        -73.997124        40.725452                1  2015
199999           14        -73.984395        40.720077                1  2010

month day hours weekday
0 5 7 19 3
1 7 17 20 4
2 8 24 21 0
3 6 26 8 4
4 8 28 17 3
... ... ... ... ...
199995 10 28 10 6
199996 3 14 1 4
199997 6 29 0 0
199998 5 20 14 2
199999 5 15 4 5

[200000 rows x 9 columns]

# Box plot of the fares, to spot outliers visually.
import matplotlib.pyplot as plt
import seaborn as sns

sns.boxplot(data=df, x='fare_amount')
plt.show()

# Pairwise correlation between all remaining columns.
df.corr()
fare_amount pickup_longitude pickup_latitude \
fare_amount 1.000000 0.010532 -0.008573
pickup_longitude 0.010532 1.000000 -0.816461
pickup_latitude -0.008573 -0.816461 1.000000
passenger_count 0.010205 -0.000414 -0.001560
year 0.127932 0.009966 -0.010233
month 0.024222 -0.004665 0.004625
day 0.001224 0.005184 -0.008264
hours -0.021455 0.002433 -0.003822
weekday 0.007641 0.000825 -0.002455

                  passenger_count       year      month        day      hours  \
fare_amount              0.010205   0.127932   0.024222   0.001224  -0.021455
pickup_longitude        -0.000414   0.009966  -0.004665   0.005184   0.002433
pickup_latitude         -0.001560  -0.010233   0.004625  -0.008264  -0.003822
passenger_count          1.000000   0.004798   0.009773   0.003252   0.013196
year                     0.004798   1.000000  -0.115859  -0.012170   0.002156
month                    0.009773  -0.115859   1.000000  -0.017360  -0.003926
day                      0.003252  -0.012170  -0.017360   1.000000   0.004677
hours                    0.013196   0.002156  -0.003926   0.004677   1.000000
weekday                  0.033196   0.006113  -0.008786   0.005617  -0.086947

weekday
fare_amount 0.007641
pickup_longitude 0.000825
pickup_latitude -0.002455
passenger_count 0.033196
year 0.006113
month -0.008786
day 0.005617
hours -0.086947
weekday 1.000000

# Annotated heatmap of the pairwise correlations.
sns.heatmap(df.corr(), annot=True)
# show() must be called; a bare `plt.show` is only an expression that
# evaluates to the function object.
plt.show()


# Target is the fare; every other column is used as a feature.
y = df['fare_amount']
x = df.drop(columns=['fare_amount'])

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# reproducible.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.2, random_state=43
)

# Fit the scaler on the training rows only and reuse its min/max for the
# test rows, so that no test-set statistics leak into training.
scaler = MinMaxScaler()
x_train = scaler.fit_transform(x_train)
x_test = scaler.transform(x_test)

from sklearn.linear_model import LinearRegression


# Baseline model: ordinary linear regression fitted on the training split.
lr = LinearRegression()
lr.fit(x_train,y_train)

LinearRegression()

# Linear-regression fare predictions for the held-out rows.
y_pred = lr.predict(x_test)
y_pred
array([10.55116531, 12.61965214, 12.59597574, ..., 9.48363077,
9.93822128, 12.56229707])

from sklearn.ensemble import RandomForestRegressor


# Second model: a random forest of 100 trees, seeded for reproducibility,
# fitted on the same training split as the linear model.
rf = RandomForestRegressor(n_estimators = 100, random_state=43)
rf.fit(x_train,y_train)

RandomForestRegressor(random_state=43)

# Random-forest fare predictions for the held-out rows.
y_pred1 = rf.predict(x_test)

y_pred1

array([ 7.29, 8.48, 10.92, ..., 29.27, 8.97, 12.41])

from sklearn.metrics import mean_squared_error, r2_score


import numpy as np

# Evaluate the linear model on the held-out rows.
# The square root of the MSE is the RMSE, so the variable is named to match.
rmse_lr = np.sqrt(mean_squared_error(y_test, y_pred))
print('RMSE of Linear Regression: ', rmse_lr)
r2_lr = r2_score(y_test, y_pred)
print('R2-Score of Linear Regression: ', r2_lr)

RMSE of Linear Regression: 9.827949026067758


R2-Score of Linear Regression: 0.019287259101657295

# Evaluate the random forest on the held-out rows.
# The square root of the MSE is the RMSE, so the variable is named to match.
rmse_rf = np.sqrt(mean_squared_error(y_test, y_pred1))
print('RMSE of Random Forest: ', rmse_rf)
r2_rf = r2_score(y_test, y_pred1)
print('R2-Score of Random Forest: ', r2_rf)

RMSE of Random Forest: 8.589228089760512


R2-Score of Random Forest: 0.2509267327753276

You might also like