Uber ml1 - Jupyter Notebook
Uber ml1 - Jupyter Notebook
In [2]: df = pd.read_csv("uber.csv")
df.head()
Out[2]:
   Unnamed: 0                            key  fare_amount          pickup_datetime  pickup_longitude  pick
0    24238194    2015-05-07 19:52:06.0000003          7.5  2015-05-07 19:52:06 UTC        -73.999817
1    27835199    2009-07-17 20:04:56.0000002          7.7  2009-07-17 20:04:56 UTC        -73.994355
2    44984355   2009-08-24 21:45:00.00000061         12.9  2009-08-24 21:45:00 UTC        -74.005043
3    25894730    2009-06-26 08:22:21.0000001          5.3  2009-06-26 08:22:21 UTC        -73.976124
4    17610152  2014-08-28 17:47:00.000000188         16.0  2014-08-28 17:47:00 UTC        -73.925023
In [4]: df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 200000 entries, 0 to 199999
Data columns (total 7 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 fare_amount 200000 non-null float64
1 pickup_datetime 200000 non-null object
2 pickup_longitude 200000 non-null float64
3 pickup_latitude 200000 non-null float64
4 dropoff_longitude 199999 non-null float64
5 dropoff_latitude 199999 non-null float64
6 passenger_count 200000 non-null int64
dtypes: float64(5), int64(1), object(1)
memory usage: 10.7+ MB
In [5]: df.dropna(how='any',inplace=True)
In [6]: df.isnull().sum()
Out[6]: fare_amount 0
pickup_datetime 0
pickup_longitude 0
pickup_latitude 0
dropoff_longitude 0
dropoff_latitude 0
passenger_count 0
dtype: int64
Boxplots
In [8]: df = df[
(df.pickup_latitude > -90) & (df.pickup_latitude < 90) &
(df.dropoff_latitude > -90) & (df.dropoff_latitude < 90) &
(df.pickup_longitude > -180) & (df.pickup_longitude < 180) &
(df.dropoff_longitude > -180) & (df.dropoff_longitude < 180) &
(df.fare_amount > 0) & (df.passenger_count > 0) & (df.passenger_co
]
km = 2 * 6371 * np.arcsin(np.sqrt(np.sin(diff_lat/2.0)**2 + np.cos
return km
Out[10]: 0 1.683323
1 2.457590
2 5.036377
3 1.661683
4 4.475450
dtype: float64
Out[11]:
   fare_amount          pickup_datetime  pickup_longitude  pickup_latitude  dropoff_longitude  d
0          7.5  2015-05-07 19:52:06 UTC        -73.999817        40.738354         -73.999512
1          7.7  2009-07-17 20:04:56 UTC        -73.994355        40.728225         -73.994710
2         12.9  2009-08-24 21:45:00 UTC        -74.005043        40.740770         -73.962565
3          5.3  2009-06-26 08:22:21 UTC        -73.976124        40.790844         -73.965316
4         16.0  2014-08-28 17:47:00 UTC        -73.925023        40.744085         -73.973082
In [12]: sns.boxplot(data=df,x='Distance')
C:\Users\HP\AppData\Local\Temp\ipykernel_16404\1295461447.py:1: SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead
C:\Users\HP\AppData\Local\Temp\ipykernel_16404\2592915223.py:1: SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead
In [16]: df.drop(columns=['pickup_datetime','pickup_latitude','pickup_longitude'
C:\Users\HP\AppData\Local\Temp\ipykernel_16404\3782303944.py:1: SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame
In [17]: df.head()
Out[17]:
fare_amount passenger_count Distance week_day Year Month Hour
C:\Users\HP\AppData\Local\Temp\ipykernel_16404\3260682206.py:17: SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead
Out[18]:
fare_amount passenger_count Distance week_day Year Month Hour
In [19]: df.corr()
Out[19]:
fare_amount passenger_count Distance week_day Year Mo
In [20]: sns.scatterplot(y=df['fare_amount'],x=df['Distance'])
In [29]: fit_predict(LinearRegression())
R-squared: 0.6041167920841171
RMSE: 0.6290054895695945
MAE: 0.27552329590959806
C:\Users\HP\AppData\Roaming\Python\Python311\site-packages\sklearn\metrics\_regression.py:483: FutureWarning: 'squared' is deprecated in version 1.4 and will be removed in 1.6. To calculate the root mean squared error, use the function'root_mean_squared_error'.
warnings.warn(
R-squared: 0.6522221648884474
RMSE: 0.5895516309915084
MAE: 0.2918258149086775
C:\Users\HP\AppData\Roaming\Python\Python311\site-packages\sklearn\metrics\_regression.py:483: FutureWarning: 'squared' is deprecated in version 1.4 and will be removed in 1.6. To calculate the root mean squared error, use the function'root_mean_squared_error'.
warnings.warn(