SPPUML1
SPPUML1
In [46]: 1 df=pd.read_csv('uber.csv')  # load uber.csv (path relative to the working directory) into df
In [47]: 1 df  # bare expression: renders the frame, truncated to its first and last rows, as Out[47]
Out[47]: Unnamed:
key fare_amount pickup_datetime pickup_longitude pick
0
2015-05-07 2015-05-07
0 24238194 7.5 -73.999817
19:52:06.0000003 19:52:06 UTC
2009-07-17 2009-07-17
1 27835199 7.7 -73.994355
20:04:56.0000002 20:04:56 UTC
2009-08-24 2009-08-24
2 44984355 12.9 -74.005043
21:45:00.00000061 21:45:00 UTC
2009-06-26 2009-06-26
3 25894730 5.3 -73.976124
08:22:21.0000001 08:22:21 UTC
2014-08-28 2014-08-28
4 17610152 16.0 -73.925023
17:47:00.000000188 17:47:00 UTC
2012-10-28 2012-10-28
199995 42598914 3.0 -73.987042
10:49:00.00000053 10:49:00 UTC
2014-03-14 2014-03-14
199996 16382965 7.5 -73.984722
01:09:00.0000008 01:09:00 UTC
2009-06-29 2009-06-29
199997 27804658 30.9 -73.986017
00:42:00.00000078 00:42:00 UTC
2015-05-20 2015-05-20
199998 20259894 14.5 -73.997124
14:56:25.0000004 14:56:25 UTC
2010-05-15 2010-05-15
199999 11951496 14.1 -73.984395
04:08:00.00000076 04:08:00 UTC
In [48]: 1 df.head()  # first five rows
Out[48]: Unnamed:
key fare_amount pickup_datetime pickup_longitude pickup_lat
0
2015-05-07 2015-05-07
0 24238194 7.5 -73.999817 40.73
19:52:06.0000003 19:52:06 UTC
2009-07-17 2009-07-17
1 27835199 7.7 -73.994355 40.72
20:04:56.0000002 20:04:56 UTC
2009-08-24 2009-08-24
2 44984355 12.9 -74.005043 40.74
21:45:00.00000061 21:45:00 UTC
2009-06-26 2009-06-26
3 25894730 5.3 -73.976124 40.79
08:22:21.0000001 08:22:21 UTC
2014-08-28 2014-08-28
4 17610152 16.0 -73.925023 40.74
17:47:00.000000188 17:47:00 UTC
In [49]: 1 df.shape  # (rows, columns) of the loaded frame
Out[49]: (200000, 9)
In [50]: 1 df.tail()  # last five rows
Out[50]: Unnamed:
key fare_amount pickup_datetime pickup_longitude picku
0
2012-10-28 2012-10-28
199995 42598914 3.0 -73.987042
10:49:00.00000053 10:49:00 UTC
2014-03-14 2014-03-14
199996 16382965 7.5 -73.984722
01:09:00.0000008 01:09:00 UTC
2009-06-29 2009-06-29
199997 27804658 30.9 -73.986017
00:42:00.00000078 00:42:00 UTC
2015-05-20 2015-05-20
199998 20259894 14.5 -73.997124
14:56:25.0000004 14:56:25 UTC
2010-05-15 2010-05-15
199999 11951496 14.1 -73.984395
04:08:00.00000076 04:08:00 UTC
In [51]: 1 df.describe()  # summary statistics; no output for this cell is present in this export
In [52]: 1 df.isna().sum()  # missing-value count per column
Out[52]: Unnamed: 0 0
key 0
fare_amount 0
pickup_datetime 0
pickup_longitude 0
pickup_latitude 0
dropoff_longitude 1
dropoff_latitude 1
passenger_count 0
dtype: int64
In [53]: 1 df.fillna(0,inplace=True)
In [54]: 1 df["pickup_datetime"] = pd.to_datetime(df["pickup_datetime"])
2
3 missing_values = df.isnull().sum()
4 print("Missing values in the dataset:")
5 print(missing_values)
6 df.dropna(inplace=True)
7 missing_values = df.isnull().sum()
8 print("Missing values after handling:")
9 print(missing_values)
10 sns.boxplot(x=df["fare_amount"])
11 plt.show()
Out[58]: 0 7.5
1 7.7
2 12.9
3 5.3
4 16.0
...
199995 3.0
199996 7.5
199997 30.9
199998 14.5
199999 14.1
Name: fare_amount, Length: 200000, dtype: float64
Out[60]: LinearRegression()
In [61]: 1 rf_model = RandomForestRegressor(n_estimators=100, random_state=42)  # 100 trees; random_state fixed so repeated fits match
2 rf_model.fit(X_train, y_train)  # X_train / y_train come from cells not present in this export; the call's return value is what Out[61] shows
Out[61]: RandomForestRegressor(random_state=42)
In [ ]: 1
In [ ]: 1