0% found this document useful (0 votes)
6 views

SourceCode Assignment1

Source of code and password validity

Uploaded by

Omkar Kamble
Copyright
© © All Rights Reserved
Available Formats
Download as PDF, TXT or read online on Scribd
0% found this document useful (0 votes)
6 views

SourceCode Assignment1

Source of code and password validity

Uploaded by

Omkar Kamble
Copyright
© © All Rights Reserved
Available Formats
Download as PDF, TXT or read online on Scribd
You are on page 1/ 9

7/18/23, 7:06 PM new_LP-III_LR_FR - Jupyter Notebook

In [1]:

import pandas as pd

In [2]:

df=pd.read_csv('C:/shubhangi/2023-24/LP-III_ML/Assignment 1/uber.csv')

In [3]:

df.head()

Out[3]:

Unnamed:
key fare_amount pickup_datetime pickup_longitude pickup_latitude dropoff_longitude dropoff_latitude passenger_count
0

2015-05-07 2015-05-07
0 24238194 7.5 -73.999817 40.738354 -73.999512 40.723217 1
19:52:06.0000003 19:52:06 UTC

2009-07-17 2009-07-17
1 27835199 7.7 -73.994355 40.728225 -73.994710 40.750325 1
20:04:56.0000002 20:04:56 UTC

2009-08-24 2009-08-24
2 44984355 12.9 -74.005043 40.740770 -73.962565 40.772647 1
21:45:00.00000061 21:45:00 UTC

2009-06-26 2009-06-26
3 25894730 5.3 -73.976124 40.790844 -73.965316 40.803349 3
08:22:21.0000001 08:22:21 UTC

2014-08-28 2014-08-28
4 17610152 16.0 -73.925023 40.744085 -73.973082 40.761247 5
17:47:00.000000188 17:47:00 UTC

In [4]:

df=df.drop(['Unnamed: 0','key','pickup_datetime'],axis=1)

In [5]:

df.shape

Out[5]:

(200000, 6)

In [6]:

df.dtypes

Out[6]:

fare_amount float64
pickup_longitude float64
pickup_latitude float64
dropoff_longitude float64
dropoff_latitude float64
passenger_count int64
dtype: object

In [7]:

set(df.dtypes)

Out[7]:

{dtype('int64'), dtype('float64')}

In [8]:

df.dropna()

Out[8]:

fare_amount pickup_longitude pickup_latitude dropoff_longitude dropoff_latitude passenger_count

0 7.5 -73.999817 40.738354 -73.999512 40.723217 1

1 7.7 -73.994355 40.728225 -73.994710 40.750325 1

2 12.9 -74.005043 40.740770 -73.962565 40.772647 1

3 5.3 -73.976124 40.790844 -73.965316 40.803349 3

4 16.0 -73.925023 40.744085 -73.973082 40.761247 5

... ... ... ... ... ... ...

199995 3.0 -73.987042 40.739367 -73.986525 40.740297 1

199996 7.5 -73.984722 40.736837 -74.006672 40.739620 1

199997 30.9 -73.986017 40.756487 -73.858957 40.692588 2

199998 14.5 -73.997124 40.725452 -73.983215 40.695415 1

199999 14.1 -73.984395 40.720077 -73.985508 40.768793 1

199999 rows × 6 columns

localhost:8888/notebooks/new_LP-III_LR_FR.ipynb 1/9
7/18/23, 7:06 PM new_LP-III_LR_FR - Jupyter Notebook

In [9]:

df.isnull().sum()

Out[9]:

fare_amount 0
pickup_longitude 0
pickup_latitude 0
dropoff_longitude 1
dropoff_latitude 1
passenger_count 0
dtype: int64

In [10]:

df['dropoff_longitude'].fillna(value=df['dropoff_longitude'].median(),inplace=True)

In [11]:

df['dropoff_latitude'].fillna(value=df['dropoff_latitude'].mean(),inplace=True)

In [12]:

df.isnull().sum()

Out[12]:

fare_amount 0
pickup_longitude 0
pickup_latitude 0
dropoff_longitude 0
dropoff_latitude 0
passenger_count 0
dtype: int64

In [13]:

import plotly.express as px

In [14]:

fig=px.box(df,y='fare_amount')

In [15]:

fig.show()

500

400

300
fare_amount

200

100

In [16]:

x=df.drop(['pickup_longitude','pickup_latitude','dropoff_longitude','dropoff_latitude'],axis=1)

localhost:8888/notebooks/new_LP-III_LR_FR.ipynb 2/9
7/18/23, 7:06 PM new_LP-III_LR_FR - Jupyter Notebook

In [17]:

df.describe()[['fare_amount', 'passenger_count']]

Out[17]:

fare_amount passenger_count

count 200000.000000 200000.000000

mean 11.359955 1.684535

std 9.901776 1.385997

min -52.000000 0.000000

25% 6.000000 1.000000

50% 8.500000 1.000000

75% 12.500000 2.000000

max 499.000000 208.000000

In [47]:

import numpy as np

In [48]:

def remove_outlier(df1 , col):


Q1 = df1[col].quantile(0.25)
Q3 = df1[col].quantile(0.75)
IQR = Q3 - Q1
lower_whisker = Q1-1.5*IQR
upper_whisker = Q3+1.5*IQR
df[col] = np.clip(df1[col] , lower_whisker , upper_whisker)
return df1

In [49]:

def treat_outliers_all(df1 , col_list):


for c in col_list:
df1 = remove_outlier(df , c)
return df1

In [50]:

df = treat_outliers_all(df , df.iloc[: , 0::])

In [52]:

import matplotlib.pyplot as plt

localhost:8888/notebooks/new_LP-III_LR_FR.ipynb 3/9
7/18/23, 7:06 PM new_LP-III_LR_FR - Jupyter Notebook

In [53]:

df.plot(kind = "box",subplots = True,layout = (7,2),figsize=(15,20))

Out[53]:

fare_amount Axes(0.125,0.786098;0.352273x0.0939024)
pickup_longitude Axes(0.547727,0.786098;0.352273x0.0939024)
pickup_latitude Axes(0.125,0.673415;0.352273x0.0939024)
dropoff_longitude Axes(0.547727,0.673415;0.352273x0.0939024)
dropoff_latitude Axes(0.125,0.560732;0.352273x0.0939024)
passenger_count Axes(0.547727,0.560732;0.352273x0.0939024)
dtype: object

In [54]:

pip install haversine

Requirement already satisfied: haversine in c:\programdata\anaconda3\lib\site-packages (2.8.0)


Note: you may need to restart the kernel to use updated packages.

In [56]:

import haversine as hs

In [57]:

travel_dist = []
for pos in range(len(df['pickup_longitude'])):
long1,lati1,long2,lati2 = [df['pickup_longitude'][pos],df['pickup_latitude'][pos],df['dropoff_longitude'][pos],df['dropoff_latitud
loc1=(lati1,long1)
loc2=(lati2,long2)
c = hs.haversine(loc1,loc2)
travel_dist.append(c)

localhost:8888/notebooks/new_LP-III_LR_FR.ipynb 4/9
7/18/23, 7:06 PM new_LP-III_LR_FR - Jupyter Notebook

In [58]:

print(travel_dist)
df['dist_travel_km'] = travel_dist
df.head()

IOPub data rate exceeded.


The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)

Out[58]:

fare_amount pickup_longitude pickup_latitude dropoff_longitude dropoff_latitude passenger_count dist_travel_km

0 7.5 -73.999817 40.738354 -73.999512 40.723217 1.0 1.683325

1 7.7 -73.994355 40.728225 -73.994710 40.750325 1.0 2.457593

2 12.9 -74.005043 40.740770 -73.962565 40.772647 1.0 5.036384

3 5.3 -73.976124 40.790844 -73.965316 40.803349 3.0 1.661686

4 16.0 -73.929786 40.744085 -73.973082 40.761247 3.5 4.116088

In [59]:

#Uber doesn't travel over 130 kms so minimize the distance


df= df.loc[(df.dist_travel_km >= 1) | (df.dist_travel_km <= 130)]
print("Remaining observastions in the dataset:", df.shape)

Remaining observastions in the dataset: (200000, 7)

In [60]:

90) and longitude (greater than or less than 180)


) |(df.pickup_latitude < -90)|(df.dropoff_latitude > 90) |(df.dropoff_latitude < -90) |(df.pickup_longitude > 180) |(df.pickup_longitude <

In [61]:

df.drop(incorrect_coordinates, inplace = True, errors = 'ignore')


df.head()

Out[61]:

fare_amount pickup_longitude pickup_latitude dropoff_longitude dropoff_latitude passenger_count dist_travel_km

0 7.5 -73.999817 40.738354 -73.999512 40.723217 1.0 1.683325

1 7.7 -73.994355 40.728225 -73.994710 40.750325 1.0 2.457593

2 12.9 -74.005043 40.740770 -73.962565 40.772647 1.0 5.036384

3 5.3 -73.976124 40.790844 -73.965316 40.803349 3.0 1.661686

4 16.0 -73.929786 40.744085 -73.973082 40.761247 3.5 4.116088

In [62]:

df.isnull().sum()

Out[62]:

fare_amount 0
pickup_longitude 0
pickup_latitude 0
dropoff_longitude 0
dropoff_latitude 0
passenger_count 0
dist_travel_km 0
dtype: int64

In [63]:

import seaborn as sns

localhost:8888/notebooks/new_LP-III_LR_FR.ipynb 5/9
7/18/23, 7:06 PM new_LP-III_LR_FR - Jupyter Notebook

In [64]:

sns.heatmap(df.isnull()) #Free for null values

Out[64]:

<Axes: >

In [65]:

corr = df.corr() #Function to find the correlation


print(corr)

fare_amount pickup_longitude pickup_latitude \


fare_amount 1.000000 0.154069 -0.110842
pickup_longitude 0.154069 1.000000 0.259497
pickup_latitude -0.110842 0.259497 1.000000
dropoff_longitude 0.218675 0.425619 0.048889
dropoff_latitude -0.125898 0.073290 0.515714
passenger_count 0.015778 -0.013213 -0.012889
dist_travel_km 0.786385 0.048446 -0.073362

dropoff_longitude dropoff_latitude passenger_count \


fare_amount 0.218675 -0.125898 0.015778
pickup_longitude 0.425619 0.073290 -0.013213
pickup_latitude 0.048889 0.515714 -0.012889
dropoff_longitude 1.000000 0.245667 -0.009303
dropoff_latitude 0.245667 1.000000 -0.006308
passenger_count -0.009303 -0.006308 1.000000
dist_travel_km 0.155191 -0.052701 0.009884

dist_travel_km
fare_amount 0.786385
pickup_longitude 0.048446
pickup_latitude -0.073362
dropoff_longitude 0.155191
dropoff_latitude -0.052701
passenger_count 0.009884
dist_travel_km 1.000000

localhost:8888/notebooks/new_LP-III_LR_FR.ipynb 6/9
7/18/23, 7:06 PM new_LP-III_LR_FR - Jupyter Notebook

In [66]:

sns.heatmap(df.corr(),annot = True)

Out[66]:

<Axes: >

In [67]:

x = df[['pickup_longitude','pickup_latitude','dropoff_longitude','dropoff_latitude','passenger_count','dist_travel_km']]

y = df['fare_amount']

In [68]:

from sklearn.model_selection import train_test_split

In [69]:

X_train,X_test,y_train,y_test = train_test_split(x,y,test_size = 0.33)

In [70]:

from sklearn.linear_model import LinearRegression


regression = LinearRegression()

In [71]:

regression.fit(X_train,y_train)

Out[71]:

▾ LinearRegression
LinearRegression()

In [72]:

regression.intercept_

Out[72]:

4461.8731571535045

In [73]:

regression.coef_

Out[73]:

array([ 26.29632195, -7.60159329, 19.73368384, -18.21120668,


0.05898655, 1.8490378 ])

localhost:8888/notebooks/new_LP-III_LR_FR.ipynb 7/9
7/18/23, 7:06 PM new_LP-III_LR_FR - Jupyter Notebook

In [74]:

prediction = regression.predict(X_test) #To predict the target values


print(prediction)

[ 6.49105246 6.92068004 5.82905968 ... 13.55261447 7.52776996


7.4194044 ]

In [75]:

y_test
from sklearn.metrics import r2_score

In [76]:

r2_score(y_test,prediction)

Out[76]:

0.6475045527243914

In [77]:

from sklearn.metrics import mean_squared_error


MSE = mean_squared_error(y_test,prediction)
print(MSE)

10.429294359791001

In [78]:

RMSE = np.sqrt(MSE)
print(RMSE)

3.229441803128058

In [79]:

from sklearn.ensemble import RandomForestRegressor

In [80]:

rf = RandomForestRegressor(n_estimators=100)

In [81]:

rf.fit(X_train,y_train)

Out[81]:

▾ RandomForestRegressor
RandomForestRegressor()

In [84]:

y_pred = rf.predict(X_test)
y_pred

Out[84]:

array([ 6.209, 6.919, 4.642, ..., 15.599, 8.569, 5.437])

In [85]:

R2_Random = r2_score(y_test,y_pred)
R2_Random

Out[85]:

0.7612178302829902

In [86]:

MSE_Random = mean_squared_error(y_test,y_pred)

In [87]:

print(MSE_Random)

7.064855887063792

In [88]:

RMSE_Random = np.sqrt(MSE_Random)
print(RMSE_Random)

2.657979662650524

localhost:8888/notebooks/new_LP-III_LR_FR.ipynb 8/9
7/18/23, 7:06 PM new_LP-III_LR_FR - Jupyter Notebook

In [89]:

print("OK")

OK

In [ ]:

localhost:8888/notebooks/new_LP-III_LR_FR.ipynb 9/9

You might also like