0% found this document useful (0 votes)

14 views9 pages

SourceCode Assignment1

Source of code and password validity

Uploaded by

Omkar Kamble

We take content rights seriously. If you suspect this is your content, claim it here.

Available Formats

Download as PDF, TXT or read online on Scribd

0% found this document useful (0 votes)

14 views9 pages

SourceCode Assignment1

Source of code and password validity

Uploaded by

Omkar Kamble

We take content rights seriously. If you suspect this is your content, claim it here.

Available Formats

Download as PDF, TXT or read online on Scribd

You are on page 1/ 9

7/18/23, 7:06 PM new_LP-III_LR_FR - Jupyter Notebook

In [1]:

import pandas as pd

In [2]:

df=pd.read_csv('C:/shubhangi/2023-24/LP-III_ML/Assignment 1/uber.csv')

In [3]:

df.head()

Out[3]:

Unnamed:
key fare_amount pickup_datetime pickup_longitude pickup_latitude dropoff_longitude dropoff_latitude passenger_count
0

2015-05-07 2015-05-07
0 24238194 7.5 -73.999817 40.738354 -73.999512 40.723217 1
19:52:06.0000003 19:52:06 UTC

2009-07-17 2009-07-17
1 27835199 7.7 -73.994355 40.728225 -73.994710 40.750325 1
20:04:56.0000002 20:04:56 UTC

2009-08-24 2009-08-24
2 44984355 12.9 -74.005043 40.740770 -73.962565 40.772647 1
21:45:00.00000061 21:45:00 UTC

2009-06-26 2009-06-26
3 25894730 5.3 -73.976124 40.790844 -73.965316 40.803349 3
08:22:21.0000001 08:22:21 UTC

2014-08-28 2014-08-28
4 17610152 16.0 -73.925023 40.744085 -73.973082 40.761247 5
17:47:00.000000188 17:47:00 UTC

In [4]:

df=df.drop(['Unnamed: 0','key','pickup_datetime'],axis=1)

In [5]:

df.shape

Out[5]:

(200000, 6)

In [6]:

df.dtypes

Out[6]:

fare_amount float64
pickup_longitude float64
pickup_latitude float64
dropoff_longitude float64
dropoff_latitude float64
passenger_count int64
dtype: object

In [7]:

set(df.dtypes)

Out[7]:

{dtype('int64'), dtype('float64')}

In [8]:

df.dropna()

Out[8]:

fare_amount pickup_longitude pickup_latitude dropoff_longitude dropoff_latitude passenger_count

0 7.5 -73.999817 40.738354 -73.999512 40.723217 1

1 7.7 -73.994355 40.728225 -73.994710 40.750325 1

2 12.9 -74.005043 40.740770 -73.962565 40.772647 1

3 5.3 -73.976124 40.790844 -73.965316 40.803349 3

4 16.0 -73.925023 40.744085 -73.973082 40.761247 5

... ... ... ... ... ... ...

199995 3.0 -73.987042 40.739367 -73.986525 40.740297 1

199996 7.5 -73.984722 40.736837 -74.006672 40.739620 1

199997 30.9 -73.986017 40.756487 -73.858957 40.692588 2

199998 14.5 -73.997124 40.725452 -73.983215 40.695415 1

199999 14.1 -73.984395 40.720077 -73.985508 40.768793 1

199999 rows × 6 columns

localhost:8888/notebooks/new_LP-III_LR_FR.ipynb 1/9
7/18/23, 7:06 PM new_LP-III_LR_FR - Jupyter Notebook

In [9]:

df.isnull().sum()

Out[9]:

fare_amount 0
pickup_longitude 0
pickup_latitude 0
dropoff_longitude 1
dropoff_latitude 1
passenger_count 0
dtype: int64

In [10]:

df['dropoff_longitude'].fillna(value=df['dropoff_longitude'].median(),inplace=True)

In [11]:

df['dropoff_latitude'].fillna(value=df['dropoff_latitude'].mean(),inplace=True)

In [12]:

df.isnull().sum()

Out[12]:

fare_amount 0
pickup_longitude 0
pickup_latitude 0
dropoff_longitude 0
dropoff_latitude 0
passenger_count 0
dtype: int64

In [13]:

import plotly.express as px

In [14]:

fig=px.box(df,y='fare_amount')

In [15]:

fig.show()

500

400

300
fare_amount

200

100

In [16]:

x=df.drop(['pickup_longitude','pickup_latitude','dropoff_longitude','dropoff_latitude'],axis=1)

localhost:8888/notebooks/new_LP-III_LR_FR.ipynb 2/9
7/18/23, 7:06 PM new_LP-III_LR_FR - Jupyter Notebook

In [17]:

df.describe()[['fare_amount', 'passenger_count']]

Out[17]:

fare_amount passenger_count

count 200000.000000 200000.000000

mean 11.359955 1.684535

std 9.901776 1.385997

min -52.000000 0.000000

25% 6.000000 1.000000

50% 8.500000 1.000000

75% 12.500000 2.000000

max 499.000000 208.000000

In [47]:

import numpy as np

In [48]:

def remove_outlier(df1 , col):

Q1 = df1[col].quantile(0.25)
Q3 = df1[col].quantile(0.75)
IQR = Q3 - Q1
lower_whisker = Q1-1.5*IQR
upper_whisker = Q3+1.5*IQR
df[col] = np.clip(df1[col] , lower_whisker , upper_whisker)
return df1

In [49]:

def treat_outliers_all(df1 , col_list):

for c in col_list:
df1 = remove_outlier(df , c)
return df1

In [50]:

df = treat_outliers_all(df , df.iloc[: , 0::])

In [52]:

import matplotlib.pyplot as plt

localhost:8888/notebooks/new_LP-III_LR_FR.ipynb 3/9
7/18/23, 7:06 PM new_LP-III_LR_FR - Jupyter Notebook

In [53]:

df.plot(kind = "box",subplots = True,layout = (7,2),figsize=(15,20))

Out[53]:

fare_amount Axes(0.125,0.786098;0.352273x0.0939024)
pickup_longitude Axes(0.547727,0.786098;0.352273x0.0939024)
pickup_latitude Axes(0.125,0.673415;0.352273x0.0939024)
dropoff_longitude Axes(0.547727,0.673415;0.352273x0.0939024)
dropoff_latitude Axes(0.125,0.560732;0.352273x0.0939024)
passenger_count Axes(0.547727,0.560732;0.352273x0.0939024)
dtype: object

In [54]:

pip install haversine

Requirement already satisfied: haversine in c:\programdata\anaconda3\lib\site-packages (2.8.0)

Note: you may need to restart the kernel to use updated packages.

In [56]:

import haversine as hs

In [57]:

travel_dist = []
for pos in range(len(df['pickup_longitude'])):
long1,lati1,long2,lati2 = [df['pickup_longitude'][pos],df['pickup_latitude'][pos],df['dropoff_longitude'][pos],df['dropoff_latitud
loc1=(lati1,long1)
loc2=(lati2,long2)
c = hs.haversine(loc1,loc2)
travel_dist.append(c)

localhost:8888/notebooks/new_LP-III_LR_FR.ipynb 4/9
7/18/23, 7:06 PM new_LP-III_LR_FR - Jupyter Notebook

In [58]:

print(travel_dist)
df['dist_travel_km'] = travel_dist
df.head()

IOPub data rate exceeded.

The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)

Out[58]:

fare_amount pickup_longitude pickup_latitude dropoff_longitude dropoff_latitude passenger_count dist_travel_km

0 7.5 -73.999817 40.738354 -73.999512 40.723217 1.0 1.683325

1 7.7 -73.994355 40.728225 -73.994710 40.750325 1.0 2.457593

2 12.9 -74.005043 40.740770 -73.962565 40.772647 1.0 5.036384

3 5.3 -73.976124 40.790844 -73.965316 40.803349 3.0 1.661686

4 16.0 -73.929786 40.744085 -73.973082 40.761247 3.5 4.116088

In [59]:

#Uber doesn't travel over 130 kms so minimize the distance

df= df.loc[(df.dist_travel_km >= 1) | (df.dist_travel_km <= 130)]
print("Remaining observastions in the dataset:", df.shape)

Remaining observastions in the dataset: (200000, 7)

In [60]:

90) and longitude (greater than or less than 180)

In [61]:

df.drop(incorrect_coordinates, inplace = True, errors = 'ignore')

df.head()

Out[61]:

fare_amount pickup_longitude pickup_latitude dropoff_longitude dropoff_latitude passenger_count dist_travel_km

0 7.5 -73.999817 40.738354 -73.999512 40.723217 1.0 1.683325

1 7.7 -73.994355 40.728225 -73.994710 40.750325 1.0 2.457593

2 12.9 -74.005043 40.740770 -73.962565 40.772647 1.0 5.036384

3 5.3 -73.976124 40.790844 -73.965316 40.803349 3.0 1.661686

4 16.0 -73.929786 40.744085 -73.973082 40.761247 3.5 4.116088

In [62]:

df.isnull().sum()

Out[62]:

fare_amount 0
pickup_longitude 0
pickup_latitude 0
dropoff_longitude 0
dropoff_latitude 0
passenger_count 0
dist_travel_km 0
dtype: int64

In [63]:

import seaborn as sns

localhost:8888/notebooks/new_LP-III_LR_FR.ipynb 5/9
7/18/23, 7:06 PM new_LP-III_LR_FR - Jupyter Notebook

In [64]:

sns.heatmap(df.isnull()) #Free for null values

Out[64]:

<Axes: >

In [65]:

corr = df.corr() #Function to find the correlation

print(corr)

fare_amount pickup_longitude pickup_latitude \

fare_amount 1.000000 0.154069 -0.110842
pickup_longitude 0.154069 1.000000 0.259497
pickup_latitude -0.110842 0.259497 1.000000
dropoff_longitude 0.218675 0.425619 0.048889
dropoff_latitude -0.125898 0.073290 0.515714
passenger_count 0.015778 -0.013213 -0.012889
dist_travel_km 0.786385 0.048446 -0.073362

dropoff_longitude dropoff_latitude passenger_count \

fare_amount 0.218675 -0.125898 0.015778
pickup_longitude 0.425619 0.073290 -0.013213
pickup_latitude 0.048889 0.515714 -0.012889
dropoff_longitude 1.000000 0.245667 -0.009303
dropoff_latitude 0.245667 1.000000 -0.006308
passenger_count -0.009303 -0.006308 1.000000
dist_travel_km 0.155191 -0.052701 0.009884

dist_travel_km
fare_amount 0.786385
pickup_longitude 0.048446
pickup_latitude -0.073362
dropoff_longitude 0.155191
dropoff_latitude -0.052701
passenger_count 0.009884
dist_travel_km 1.000000

localhost:8888/notebooks/new_LP-III_LR_FR.ipynb 6/9
7/18/23, 7:06 PM new_LP-III_LR_FR - Jupyter Notebook

In [66]:

sns.heatmap(df.corr(),annot = True)

Out[66]:

<Axes: >

In [67]:

x = df[['pickup_longitude','pickup_latitude','dropoff_longitude','dropoff_latitude','passenger_count','dist_travel_km']]

y = df['fare_amount']

In [68]:

from sklearn.model_selection import train_test_split

In [69]:

X_train,X_test,y_train,y_test = train_test_split(x,y,test_size = 0.33)

In [70]:

from sklearn.linear_model import LinearRegression

regression = LinearRegression()

In [71]:

regression.fit(X_train,y_train)

Out[71]:

▾ LinearRegression
LinearRegression()

In [72]:

regression.intercept_

Out[72]:

4461.8731571535045

In [73]:

regression.coef_

Out[73]:

array([ 26.29632195, -7.60159329, 19.73368384, -18.21120668,

0.05898655, 1.8490378 ])

localhost:8888/notebooks/new_LP-III_LR_FR.ipynb 7/9
7/18/23, 7:06 PM new_LP-III_LR_FR - Jupyter Notebook

In [74]:

prediction = regression.predict(X_test) #To predict the target values

print(prediction)

[ 6.49105246 6.92068004 5.82905968 ... 13.55261447 7.52776996

7.4194044 ]

In [75]:

y_test
from sklearn.metrics import r2_score

In [76]:

r2_score(y_test,prediction)

Out[76]:

0.6475045527243914

In [77]:

from sklearn.metrics import mean_squared_error

MSE = mean_squared_error(y_test,prediction)
print(MSE)

10.429294359791001

In [78]:

RMSE = np.sqrt(MSE)
print(RMSE)

3.229441803128058

In [79]:

from sklearn.ensemble import RandomForestRegressor

In [80]:

rf = RandomForestRegressor(n_estimators=100)

In [81]:

rf.fit(X_train,y_train)

Out[81]:

▾ RandomForestRegressor
RandomForestRegressor()

In [84]:

y_pred = rf.predict(X_test)
y_pred

Out[84]:

array([ 6.209, 6.919, 4.642, ..., 15.599, 8.569, 5.437])

In [85]:

R2_Random = r2_score(y_test,y_pred)
R2_Random

Out[85]:

0.7612178302829902

In [86]:

MSE_Random = mean_squared_error(y_test,y_pred)

In [87]:

print(MSE_Random)

7.064855887063792

In [88]:

RMSE_Random = np.sqrt(MSE_Random)
print(RMSE_Random)

2.657979662650524

localhost:8888/notebooks/new_LP-III_LR_FR.ipynb 8/9
7/18/23, 7:06 PM new_LP-III_LR_FR - Jupyter Notebook

In [89]:

print("OK")

In [ ]:

localhost:8888/notebooks/new_LP-III_LR_FR.ipynb 9/9

Data Analytics All Paper Solution
No ratings yet
Data Analytics All Paper Solution
11 pages
Onebc
No ratings yet
Onebc
2 pages
Uber Price Prediction
No ratings yet
Uber Price Prediction
6 pages
08 Causal Inference I: MSBA7003 Quantitative Analysis Methods
No ratings yet
08 Causal Inference I: MSBA7003 Quantitative Analysis Methods
32 pages
Solution Manual For Microeconometrics
59% (22)
Solution Manual For Microeconometrics
785 pages
ML - 2 - Jupyter Notebook
No ratings yet
ML - 2 - Jupyter Notebook
6 pages
Divvy Exercise R Script
No ratings yet
Divvy Exercise R Script
5 pages
ML 1 Um
No ratings yet
ML 1 Um
5 pages
Predict The Price of The Uber Ride From A Given Pickup Point To The Agreed Drop-Off Location
No ratings yet
Predict The Price of The Uber Ride From A Given Pickup Point To The Agreed Drop-Off Location
9 pages
SHS Core - Statistics and Probability CG
100% (1)
SHS Core - Statistics and Probability CG
11 pages
Name: Siddhesh Asati: #Group: B (ML) #Assignment: 6
No ratings yet
Name: Siddhesh Asati: #Group: B (ML) #Assignment: 6
9 pages
SPPUML1
No ratings yet
SPPUML1
8 pages
EDA Optimising NYC Taxis GautamTiwari - Cleanup
No ratings yet
EDA Optimising NYC Taxis GautamTiwari - Cleanup
1 page
Ml Prac 1 Pratiksha
No ratings yet
Ml Prac 1 Pratiksha
15 pages
Guidebook To Data Analyst
No ratings yet
Guidebook To Data Analyst
51 pages
Practical 1
No ratings yet
Practical 1
6 pages
Ml Prac 1 Urvashi
No ratings yet
Ml Prac 1 Urvashi
15 pages
UBER
No ratings yet
UBER
2 pages
Final Year Dissertation Report FMS MBA F
100% (2)
Final Year Dissertation Report FMS MBA F
44 pages
Kappa 2009 PDF
No ratings yet
Kappa 2009 PDF
48 pages
Uber
No ratings yet
Uber
7 pages
Heckman Selection Models
No ratings yet
Heckman Selection Models
4 pages
Ml-Exp-1 - Jupyter Notebook
No ratings yet
Ml-Exp-1 - Jupyter Notebook
8 pages
Beyond Multiple Linear Regression Applied Generalized Linear Models and Multilevel Models in R 1st Edition Paul Roback
No ratings yet
Beyond Multiple Linear Regression Applied Generalized Linear Models and Multilevel Models in R 1st Edition Paul Roback
71 pages
P1) Code Uber
No ratings yet
P1) Code Uber
6 pages
QMBus Outline s111
No ratings yet
QMBus Outline s111
16 pages
Poe 5 Statatoc
No ratings yet
Poe 5 Statatoc
12 pages
(Ebook) SPSS Demystified: A Simple Guide and Reference by Ronald D Yockey ISBN 9781138286283, 1138286281 instant download full chapters
No ratings yet
(Ebook) SPSS Demystified: A Simple Guide and Reference by Ronald D Yockey ISBN 9781138286283, 1138286281 instant download full chapters
156 pages
Lab1.ipynb - Colaboratory
No ratings yet
Lab1.ipynb - Colaboratory
9 pages
ML 1 16
No ratings yet
ML 1 16
13 pages
Scaffold FG
No ratings yet
Scaffold FG
13 pages
ML Practical 1
No ratings yet
ML Practical 1
15 pages
Find - Flight: Def,, For in If and Return Return None Def
No ratings yet
Find - Flight: Def,, For in If and Return Return None Def
2 pages
# Load The Titanic Dataset: Import As Import As From Import From Import
No ratings yet
# Load The Titanic Dataset: Import As Import As From Import From Import
9 pages
SYJCMATHS2LMR
No ratings yet
SYJCMATHS2LMR
67 pages
Uber ml1 - Jupyter Notebook
No ratings yet
Uber ml1 - Jupyter Notebook
10 pages
Stat Module 1
No ratings yet
Stat Module 1
8 pages
ML - Practical - 1 - Jupyter Notebook
No ratings yet
ML - Practical - 1 - Jupyter Notebook
15 pages
Check Data Types and Data Structures For All The Data Frames - Sapply (Tripdata - 202307, Class) To Sapply (Tripdata - 202406, Class)
No ratings yet
Check Data Types and Data Structures For All The Data Frames - Sapply (Tripdata - 202307, Class) To Sapply (Tripdata - 202406, Class)
9 pages
PG 86 Varun Chitale
No ratings yet
PG 86 Varun Chitale
29 pages
ML Unit 1
No ratings yet
ML Unit 1
124 pages
Project Report
No ratings yet
Project Report
37 pages
Lecture03 MachineLearning
No ratings yet
Lecture03 MachineLearning
78 pages
Untitled 18
No ratings yet
Untitled 18
7 pages
Journal of The Air & Waste Management Association
No ratings yet
Journal of The Air & Waste Management Association
8 pages
Gr5205 Midterm Key
No ratings yet
Gr5205 Midterm Key
13 pages
Fundamental Statistics For The Behavioral Sciences v2.0
No ratings yet
Fundamental Statistics For The Behavioral Sciences v2.0
342 pages
ML A 6 Project
No ratings yet
ML A 6 Project
18 pages
ML All Prints
No ratings yet
ML All Prints
25 pages
Effects of Role Overload
No ratings yet
Effects of Role Overload
5 pages
Nature and Uses of Forecasting
No ratings yet
Nature and Uses of Forecasting
4 pages
ABCA 2 Model Building
No ratings yet
ABCA 2 Model Building
9 pages
Internship Final 1
No ratings yet
Internship Final 1
12 pages
Final Questions For Last Class
No ratings yet
Final Questions For Last Class
5 pages
UBER Data Wrangling
No ratings yet
UBER Data Wrangling
45 pages
Regression Excel Template
No ratings yet
Regression Excel Template
4 pages
Warehouse Cost Estimation
No ratings yet
Warehouse Cost Estimation
60 pages
Chapter 1 - BRODGAR STATISTIC
No ratings yet
Chapter 1 - BRODGAR STATISTIC
4 pages
ML Practical 1 Code
100% (1)
ML Practical 1 Code
1 page
ML Practical 1
No ratings yet
ML Practical 1
15 pages
Case Study 1 Exercise R Script
No ratings yet
Case Study 1 Exercise R Script
5 pages
Decision Science Assignment
No ratings yet
Decision Science Assignment
5 pages
Delhivery Case Study Compressed
No ratings yet
Delhivery Case Study Compressed
31 pages
ML Code Output
No ratings yet
ML Code Output
38 pages
Train Reservation
No ratings yet
Train Reservation
16 pages
Loading The Dataset: First We Load The Dataset and Find Out The Number of Columns, Rows, NULL Values, Etc
100% (1)
Loading The Dataset: First We Load The Dataset and Find Out The Number of Columns, Rows, NULL Values, Etc
8 pages
Praktikum 5
No ratings yet
Praktikum 5
20 pages
Uber - Rides - Analysis - Jupyter Notebook
No ratings yet
Uber - Rides - Analysis - Jupyter Notebook
12 pages
Zahra Ratu Audia - (17821107) - Praktikum 6
100% (2)
Zahra Ratu Audia - (17821107) - Praktikum 6
10 pages
Assignment No 1 Output
No ratings yet
Assignment No 1 Output
42 pages
Shaheed Zulfikar Ali Bhutto Institute of Science & Technology
No ratings yet
Shaheed Zulfikar Ali Bhutto Institute of Science & Technology
12 pages
Merged
No ratings yet
Merged
47 pages
Random Forest Model
No ratings yet
Random Forest Model
16 pages
Analyzing Taxi Trends
No ratings yet
Analyzing Taxi Trends
43 pages
Railway Price Prediction
No ratings yet
Railway Price Prediction
20 pages
Airlanes Booking Analys
No ratings yet
Airlanes Booking Analys
26 pages
Airfare ML - Predicting Flight Fares
No ratings yet
Airfare ML - Predicting Flight Fares
21 pages
Supervised Regression
No ratings yet
Supervised Regression
24 pages
Bose A S
No ratings yet
Bose A S
37 pages
Airline Passenger Booking Analyze
No ratings yet
Airline Passenger Booking Analyze
26 pages
Taxi Trips Analysis Project 1682332303
100% (2)
Taxi Trips Analysis Project 1682332303
28 pages
Taxi Fare Team 09
No ratings yet
Taxi Fare Team 09
25 pages
Delhivery Feature Engineering Cs
No ratings yet
Delhivery Feature Engineering Cs
46 pages
Project On Tour and Travel
No ratings yet
Project On Tour and Travel
12 pages
Flight-Price-Prediction - Flight - Price - Ipynb at Master Mandal-21 - Flight-Price-Prediction
No ratings yet
Flight-Price-Prediction - Flight - Price - Ipynb at Master Mandal-21 - Flight-Price-Prediction
28 pages
SN Travel Jupyter Notebook PDF
No ratings yet
SN Travel Jupyter Notebook PDF
28 pages
PMT2 21
No ratings yet
PMT2 21
39 pages
Flight Price Prediction Capstone Project Submission 2
No ratings yet
Flight Price Prediction Capstone Project Submission 2
69 pages

SourceCode Assignment1

Uploaded by

SourceCode Assignment1

Uploaded by

7/18/23, 7:06 PM new_LP-III_LR_FR - Jupyter Notebook

fare_amount pickup_longitude pickup_latitude dropoff_longitude dropoff_latitude passenger_count

0 7.5 -73.999817 40.738354 -73.999512 40.723217 1

1 7.7 -73.994355 40.728225 -73.994710 40.750325 1

2 12.9 -74.005043 40.740770 -73.962565 40.772647 1

3 5.3 -73.976124 40.790844 -73.965316 40.803349 3

4 16.0 -73.925023 40.744085 -73.973082 40.761247 5

... ... ... ... ... ... ...

199995 3.0 -73.987042 40.739367 -73.986525 40.740297 1

199996 7.5 -73.984722 40.736837 -74.006672 40.739620 1

199997 30.9 -73.986017 40.756487 -73.858957 40.692588 2

199998 14.5 -73.997124 40.725452 -73.983215 40.695415 1

199999 14.1 -73.984395 40.720077 -73.985508 40.768793 1

199999 rows × 6 columns

count 200000.000000 200000.000000

mean 11.359955 1.684535

std 9.901776 1.385997

min -52.000000 0.000000

25% 6.000000 1.000000

50% 8.500000 1.000000

75% 12.500000 2.000000

max 499.000000 208.000000

def remove_outlier(df1 , col):

def treat_outliers_all(df1 , col_list):

df = treat_outliers_all(df , df.iloc[: , 0::])

import matplotlib.pyplot as plt

df.plot(kind = "box",subplots = True,layout = (7,2),figsize=(15,20))

pip install haversine

Requirement already satisfied: haversine in c:\programdata\anaconda3\lib\site-packages (2.8.0)

IOPub data rate exceeded.

fare_amount pickup_longitude pickup_latitude dropoff_longitude dropoff_latitude passenger_count dist_travel_km

0 7.5 -73.999817 40.738354 -73.999512 40.723217 1.0 1.683325

1 7.7 -73.994355 40.728225 -73.994710 40.750325 1.0 2.457593

2 12.9 -74.005043 40.740770 -73.962565 40.772647 1.0 5.036384

3 5.3 -73.976124 40.790844 -73.965316 40.803349 3.0 1.661686

4 16.0 -73.929786 40.744085 -73.973082 40.761247 3.5 4.116088

#Uber doesn't travel over 130 kms so minimize the distance

Remaining observastions in the dataset: (200000, 7)

90) and longitude (greater than or less than 180)

df.drop(incorrect_coordinates, inplace = True, errors = 'ignore')

fare_amount pickup_longitude pickup_latitude dropoff_longitude dropoff_latitude passenger_count dist_travel_km

0 7.5 -73.999817 40.738354 -73.999512 40.723217 1.0 1.683325

1 7.7 -73.994355 40.728225 -73.994710 40.750325 1.0 2.457593

2 12.9 -74.005043 40.740770 -73.962565 40.772647 1.0 5.036384

3 5.3 -73.976124 40.790844 -73.965316 40.803349 3.0 1.661686

4 16.0 -73.929786 40.744085 -73.973082 40.761247 3.5 4.116088

import seaborn as sns

sns.heatmap(df.isnull()) #Free for null values

corr = df.corr() #Function to find the correlation

fare_amount pickup_longitude pickup_latitude \

dropoff_longitude dropoff_latitude passenger_count \

from sklearn.model_selection import train_test_split

X_train,X_test,y_train,y_test = train_test_split(x,y,test_size = 0.33)

from sklearn.linear_model import LinearRegression

array([ 26.29632195, -7.60159329, 19.73368384, -18.21120668,

prediction = regression.predict(X_test) #To predict the target values

[ 6.49105246 6.92068004 5.82905968 ... 13.55261447 7.52776996

from sklearn.metrics import mean_squared_error

from sklearn.ensemble import RandomForestRegressor

array([ 6.209, 6.919, 4.642, ..., 15.599, 8.569, 5.437])

You might also like