SourceCode Assignment1
SourceCode Assignment1
In [1]:
import pandas as pd
In [2]:
df=pd.read_csv('C:/shubhangi/2023-24/LP-III_ML/Assignment 1/uber.csv')
In [3]:
df.head()
Out[3]:
Unnamed:
key fare_amount pickup_datetime pickup_longitude pickup_latitude dropoff_longitude dropoff_latitude passenger_count
0
2015-05-07 2015-05-07
0 24238194 7.5 -73.999817 40.738354 -73.999512 40.723217 1
19:52:06.0000003 19:52:06 UTC
2009-07-17 2009-07-17
1 27835199 7.7 -73.994355 40.728225 -73.994710 40.750325 1
20:04:56.0000002 20:04:56 UTC
2009-08-24 2009-08-24
2 44984355 12.9 -74.005043 40.740770 -73.962565 40.772647 1
21:45:00.00000061 21:45:00 UTC
2009-06-26 2009-06-26
3 25894730 5.3 -73.976124 40.790844 -73.965316 40.803349 3
08:22:21.0000001 08:22:21 UTC
2014-08-28 2014-08-28
4 17610152 16.0 -73.925023 40.744085 -73.973082 40.761247 5
17:47:00.000000188 17:47:00 UTC
In [4]:
df=df.drop(['Unnamed: 0','key','pickup_datetime'],axis=1)
In [5]:
df.shape
Out[5]:
(200000, 6)
In [6]:
df.dtypes
Out[6]:
fare_amount float64
pickup_longitude float64
pickup_latitude float64
dropoff_longitude float64
dropoff_latitude float64
passenger_count int64
dtype: object
In [7]:
set(df.dtypes)
Out[7]:
{dtype('int64'), dtype('float64')}
In [8]:
df.dropna()
Out[8]:
localhost:8888/notebooks/new_LP-III_LR_FR.ipynb 1/9
7/18/23, 7:06 PM new_LP-III_LR_FR - Jupyter Notebook
In [9]:
df.isnull().sum()
Out[9]:
fare_amount 0
pickup_longitude 0
pickup_latitude 0
dropoff_longitude 1
dropoff_latitude 1
passenger_count 0
dtype: int64
In [10]:
df['dropoff_longitude'].fillna(value=df['dropoff_longitude'].median(),inplace=True)
In [11]:
df['dropoff_latitude'].fillna(value=df['dropoff_latitude'].mean(),inplace=True)
In [12]:
df.isnull().sum()
Out[12]:
fare_amount 0
pickup_longitude 0
pickup_latitude 0
dropoff_longitude 0
dropoff_latitude 0
passenger_count 0
dtype: int64
In [13]:
import plotly.express as px
In [14]:
fig=px.box(df,y='fare_amount')
In [15]:
fig.show()
500
400
300
fare_amount
200
100
In [16]:
x=df.drop(['pickup_longitude','pickup_latitude','dropoff_longitude','dropoff_latitude'],axis=1)
localhost:8888/notebooks/new_LP-III_LR_FR.ipynb 2/9
7/18/23, 7:06 PM new_LP-III_LR_FR - Jupyter Notebook
In [17]:
df.describe()[['fare_amount', 'passenger_count']]
Out[17]:
fare_amount passenger_count
In [47]:
import numpy as np
In [48]:
In [49]:
In [50]:
In [52]:
localhost:8888/notebooks/new_LP-III_LR_FR.ipynb 3/9
7/18/23, 7:06 PM new_LP-III_LR_FR - Jupyter Notebook
In [53]:
Out[53]:
fare_amount Axes(0.125,0.786098;0.352273x0.0939024)
pickup_longitude Axes(0.547727,0.786098;0.352273x0.0939024)
pickup_latitude Axes(0.125,0.673415;0.352273x0.0939024)
dropoff_longitude Axes(0.547727,0.673415;0.352273x0.0939024)
dropoff_latitude Axes(0.125,0.560732;0.352273x0.0939024)
passenger_count Axes(0.547727,0.560732;0.352273x0.0939024)
dtype: object
In [54]:
In [56]:
import haversine as hs
In [57]:
travel_dist = []
for pos in range(len(df['pickup_longitude'])):
long1,lati1,long2,lati2 = [df['pickup_longitude'][pos],df['pickup_latitude'][pos],df['dropoff_longitude'][pos],df['dropoff_latitud
loc1=(lati1,long1)
loc2=(lati2,long2)
c = hs.haversine(loc1,loc2)
travel_dist.append(c)
localhost:8888/notebooks/new_LP-III_LR_FR.ipynb 4/9
7/18/23, 7:06 PM new_LP-III_LR_FR - Jupyter Notebook
In [58]:
print(travel_dist)
df['dist_travel_km'] = travel_dist
df.head()
Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)
Out[58]:
In [59]:
In [60]:
In [61]:
Out[61]:
In [62]:
df.isnull().sum()
Out[62]:
fare_amount 0
pickup_longitude 0
pickup_latitude 0
dropoff_longitude 0
dropoff_latitude 0
passenger_count 0
dist_travel_km 0
dtype: int64
In [63]:
localhost:8888/notebooks/new_LP-III_LR_FR.ipynb 5/9
7/18/23, 7:06 PM new_LP-III_LR_FR - Jupyter Notebook
In [64]:
Out[64]:
<Axes: >
In [65]:
dist_travel_km
fare_amount 0.786385
pickup_longitude 0.048446
pickup_latitude -0.073362
dropoff_longitude 0.155191
dropoff_latitude -0.052701
passenger_count 0.009884
dist_travel_km 1.000000
localhost:8888/notebooks/new_LP-III_LR_FR.ipynb 6/9
7/18/23, 7:06 PM new_LP-III_LR_FR - Jupyter Notebook
In [66]:
sns.heatmap(df.corr(),annot = True)
Out[66]:
<Axes: >
In [67]:
x = df[['pickup_longitude','pickup_latitude','dropoff_longitude','dropoff_latitude','passenger_count','dist_travel_km']]
y = df['fare_amount']
In [68]:
In [69]:
In [70]:
In [71]:
regression.fit(X_train,y_train)
Out[71]:
▾ LinearRegression
LinearRegression()
In [72]:
regression.intercept_
Out[72]:
4461.8731571535045
In [73]:
regression.coef_
Out[73]:
localhost:8888/notebooks/new_LP-III_LR_FR.ipynb 7/9
7/18/23, 7:06 PM new_LP-III_LR_FR - Jupyter Notebook
In [74]:
In [75]:
y_test
from sklearn.metrics import r2_score
In [76]:
r2_score(y_test,prediction)
Out[76]:
0.6475045527243914
In [77]:
10.429294359791001
In [78]:
RMSE = np.sqrt(MSE)
print(RMSE)
3.229441803128058
In [79]:
In [80]:
rf = RandomForestRegressor(n_estimators=100)
In [81]:
rf.fit(X_train,y_train)
Out[81]:
▾ RandomForestRegressor
RandomForestRegressor()
In [84]:
y_pred = rf.predict(X_test)
y_pred
Out[84]:
In [85]:
R2_Random = r2_score(y_test,y_pred)
R2_Random
Out[85]:
0.7612178302829902
In [86]:
MSE_Random = mean_squared_error(y_test,y_pred)
In [87]:
print(MSE_Random)
7.064855887063792
In [88]:
RMSE_Random = np.sqrt(MSE_Random)
print(RMSE_Random)
2.657979662650524
localhost:8888/notebooks/new_LP-III_LR_FR.ipynb 8/9
7/18/23, 7:06 PM new_LP-III_LR_FR - Jupyter Notebook
In [89]:
print("OK")
OK
In [ ]:
localhost:8888/notebooks/new_LP-III_LR_FR.ipynb 9/9