ML 1 16
ML 1 16
2 # Roll_no:2441011
3
4 #Assignment No.:1
5 #Title:Predict the price of the Uber ride from a given pickup
6 #point to the agreed drop-off location.
7 # Perform following tasks:
8 # 1. Pre-process the dataset.
9 # 2. Identify outliers.
10 # 3. Check the correlation.
11 # 4. Implement linear regression and random forest regression models.
12 # 5. Evaluate the models and compare their respective scores like R2, RMSE,etc.
13
14 import pandas as pd
15 import numpy as np
16 import seaborn as sns
17 import matplotlib.pyplot as plt
In [3]: 1 df
Out[3]:
Unnamed:
key fare_amount pickup_datetime pickup_longitude pickup_latitude dropoff_lon
0
2015-
2015-05-07
0 24238194 05-07 7.5 -73.999817 40.738354 -73.9
19:52:06 UTC
19:52:06
2009-
2009-07-17
1 27835199 07-17 7.7 -73.994355 40.728225 -73.9
20:04:56 UTC
20:04:56
2009-
2009-08-24
2 44984355 08-24 12.9 -74.005043 40.740770 -73.9
21:45:00 UTC
21:45:00
2009-
2009-06-26
3 25894730 06-26 5.3 -73.976124 40.790844 -73.9
08:22:21 UTC
8:22:21
2014-
2014-08-28
4 17610152 08-28 16.0 -73.925023 40.744085 -73.9
17:47:00 UTC
17:47:00
2012-
2012-10-28
199995 42598914 10-28 3.0 -73.987042 40.739367 -73.9
10:49:00 UTC
10:49:00
2014-
2014-03-14
199996 16382965 03-14 7.5 -73.984722 40.736837 -74.0
01:09:00 UTC
1:09:00
2009-
2009-06-29
199997 27804658 06-29 30.9 -73.986017 40.756487 -73.8
00:42:00 UTC
0:42:00
2015-
2015-05-20
199998 20259894 05-20 14.5 -73.997124 40.725452 -73.9
14:56:25 UTC
14:56:25
2010-
2010-05-15
199999 11951496 05-15 14.1 -73.984395 40.720077 -73.9
04:08:00 UTC
4:08:00
Out[4]:
Unnamed:
key fare_amount pickup_datetime pickup_longitude pickup_latitude dropoff_longitude
0
2015-
2015-05-07
0 24238194 05-07 7.5 -73.999817 40.738354 -73.999512
19:52:06 UTC
19:52:06
2009-
2009-07-17
1 27835199 07-17 7.7 -73.994355 40.728225 -73.994710
20:04:56 UTC
20:04:56
2009-
2009-08-24
2 44984355 08-24 12.9 -74.005043 40.740770 -73.962565
21:45:00 UTC
21:45:00
2009-
2009-06-26
3 25894730 06-26 5.3 -73.976124 40.790844 -73.965316
08:22:21 UTC
8:22:21
2014-
2014-08-28
4 17610152 08-28 16.0 -73.925023 40.744085 -73.973082
17:47:00 UTC
17:47:00
In [5]: 1 df.tail()
Out[5]:
Unnamed:
key fare_amount pickup_datetime pickup_longitude pickup_latitude dropoff_lon
0
2012-
2012-10-28
199995 42598914 10-28 3.0 -73.987042 40.739367 -73.9
10:49:00 UTC
10:49:00
2014-
2014-03-14
199996 16382965 03-14 7.5 -73.984722 40.736837 -74.0
01:09:00 UTC
1:09:00
2009-
2009-06-29
199997 27804658 06-29 30.9 -73.986017 40.756487 -73.8
00:42:00 UTC
0:42:00
2015-
2015-05-20
199998 20259894 05-20 14.5 -73.997124 40.725452 -73.9
14:56:25 UTC
14:56:25
2010-
2010-05-15
199999 11951496 05-15 14.1 -73.984395 40.720077 -73.9
04:08:00 UTC
4:08:00
In [6]: 1 df.isna().sum()
Out[6]: Unnamed: 0 0
key 0
fare_amount 0
pickup_datetime 0
pickup_longitude 0
pickup_latitude 0
dropoff_longitude 1
dropoff_latitude 1
passenger_count 0
dtype: int64
In [7]: 1 df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 200000 entries, 0 to 199999
Data columns (total 9 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 Unnamed: 0 200000 non-null int64
1 key 200000 non-null object
2 fare_amount 200000 non-null float64
3 pickup_datetime 200000 non-null object
4 pickup_longitude 200000 non-null float64
5 pickup_latitude 200000 non-null float64
6 dropoff_longitude 199999 non-null float64
7 dropoff_latitude 199999 non-null float64
8 passenger_count 200000 non-null int64
dtypes: float64(5), int64(2), object(2)
memory usage: 13.7+ MB
In [8]: 1 df.dtypes
In [9]: 1 df.shape
Out[9]: (200000, 9)
In [10]: 1 df['pickup_datetime']=pd.to_datetime(df['pickup_datetime'])
In [11]: 1 df.dtypes
Out[13]:
fare_amount pickup_datetime pickup_longitude pickup_latitude dropoff_longitude dropoff_latitud
2015-05-07
0 7.5 -73.999817 40.738354 -73.999512 40.7232
19:52:06+00:00
2009-07-17
1 7.7 -73.994355 40.728225 -73.994710 40.75032
20:04:56+00:00
2009-08-24
2 12.9 -74.005043 40.740770 -73.962565 40.77264
21:45:00+00:00
2009-06-26
3 5.3 -73.976124 40.790844 -73.965316 40.80334
08:22:21+00:00
2014-08-28
4 16.0 -73.925023 40.744085 -73.973082 40.76124
17:47:00+00:00
2012-10-28
199995 3.0 -73.987042 40.739367 -73.986525 40.74029
10:49:00+00:00
2014-03-14
199996 7.5 -73.984722 40.736837 -74.006672 40.73962
01:09:00+00:00
2009-06-29
199997 30.9 -73.986017 40.756487 -73.858957 40.69258
00:42:00+00:00
2015-05-20
199998 14.5 -73.997124 40.725452 -73.983215 40.6954
14:56:25+00:00
2010-05-15
199999 14.1 -73.984395 40.720077 -73.985508 40.76879
04:08:00+00:00
In [14]: 1 df.fillna(0,inplace=True)
In [15]: 1 df.isnull().sum()
Out[15]: fare_amount 0
pickup_datetime 0
pickup_longitude 0
pickup_latitude 0
dropoff_longitude 0
dropoff_latitude 0
passenger_count 0
dtype: int64
In [16]: 1 df=df.assign(hour=df.pickup_datetime.dt.hour,
2 day=df.pickup_datetime.dt.day,
3 month=df.pickup_datetime.dt.month,
4 year=df.pickup_datetime.dt.year,
5 daysofweek=df.pickup_datetime.dt.dayofweek)
In [17]: 1 df
Out[17]:
fare_amount pickup_datetime pickup_longitude pickup_latitude dropoff_longitude dropoff_latitud
2015-05-07
0 7.5 -73.999817 40.738354 -73.999512 40.7232
19:52:06+00:00
2009-07-17
1 7.7 -73.994355 40.728225 -73.994710 40.75032
20:04:56+00:00
2009-08-24
2 12.9 -74.005043 40.740770 -73.962565 40.77264
21:45:00+00:00
2009-06-26
3 5.3 -73.976124 40.790844 -73.965316 40.80334
08:22:21+00:00
2014-08-28
4 16.0 -73.925023 40.744085 -73.973082 40.76124
17:47:00+00:00
2012-10-28
199995 3.0 -73.987042 40.739367 -73.986525 40.74029
10:49:00+00:00
2014-03-14
199996 7.5 -73.984722 40.736837 -74.006672 40.73962
01:09:00+00:00
2009-06-29
199997 30.9 -73.986017 40.756487 -73.858957 40.69258
00:42:00+00:00
2015-05-20
199998 14.5 -73.997124 40.725452 -73.983215 40.6954
14:56:25+00:00
2010-05-15
199999 14.1 -73.984395 40.720077 -73.985508 40.76879
04:08:00+00:00
In [18]: 1 df=df.drop("pickup_datetime",axis=1)
In [19]: 1 df.plot()
In [20]: 1 df.plot(kind="box")
(11, 11)
In [28]: 1 travel_dist = []
2 for pos in range(len(df['pickup_longitude'])):
3 long1,lati1,long2,lati2 = [df['pickup_longitude']
4 [pos],df['pickup_latitude'][pos],df['dropoff_longitude']
5 [pos],df['dropoff_latitude'][pos]]
6 loc1=(lati1,long1)
7 loc2=(lati2,long2)
8 c = hs.haversine(loc1,loc2)
9 travel_dist.append(c)
10
11
12 print(travel_dist)
13 df['dist_travel_km'] = travel_dist
14 df.head()
Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)
Out[28]:
fare_amount pickup_longitude pickup_latitude dropoff_longitude dropoff_latitude passenger_count h
In [29]: 1 x=df[['pickup_longitude','pickup_latitude',
2 'dropoff_longitude','dropoff_latitude','passenger_count',
3 'hour','day','month','year','daysofweek','dist_travel_km']]
4 y=df['fare_amount']
5
6
R² Score: 0.6589516503634483
Mean Absolute Error (MAE): 2.157564875844668
Mean Squared Error (MSE): 10.146783070723341
Root Mean Squared Error (RMSE): 3.185401555647787
In [35]: 1
2 def custom_accuracy(y_true, y_pred, tolerance=0.1):
3 return np.mean(np.abs(y_true - y_pred) <= tolerance)
4
5 accuracy = custom_accuracy(y_test, y_pred, tolerance=0.5)
6 print(f'Custom Accuracy: {accuracy}')
7
Out[40]: ▾ RandomForestRegressor
RandomForestRegressor(random_state=42)
Random Forest Model: [ 5.855 12.729 7.338 ... 4.816 5.828 8.495]
In [ ]: 1