Practical 1
Practical 1
# Load the ride data from uber.csv and take a first look at it.
# `pd` is used throughout this file but not imported anywhere in view,
# so bring it into scope here.
import pandas as pd

df = pd.read_csv('uber.csv')
df.head()
# Count the missing values in every column.
df.isnull().sum()
Unnamed: 0 0
key 0
fare_amount 0
pickup_datetime 0
pickup_longitude 0
pickup_latitude 0
dropoff_longitude 1
dropoff_latitude 1
passenger_count 0
dtype: int64
# Drop the unnamed leading column, the ride key and the drop-off coordinates
# (the only columns reporting missing values in the null count above).
# The label 'Unnamed: 0' has to stay on one line: a quoted string broken
# across two lines is a syntax error and would not match the column anyway.
df = df.drop(
    columns=['Unnamed: 0', 'key', 'dropoff_longitude', 'dropoff_latitude']
)
# Inspect the dtypes of the remaining columns.
df.dtypes
fare_amount float64
pickup_datetime object
pickup_longitude float64
pickup_latitude float64
passenger_count int64
dtype: object
# Store fares as whole numbers; the integer cast discards the fractional part.
# Casting the pickup coordinates the same way was tried and left disabled,
# so longitude and latitude stay as floats.
df['fare_amount'] = df['fare_amount'].astype(int)
# Confirm the new dtype.
df.dtypes
fare_amount int32
pickup_datetime object
pickup_longitude float64
pickup_latitude float64
passenger_count int64
dtype: object
# Derive calendar features from the pickup timestamp.
# The column is loaded as dtype `object` (plain strings), and the `.dt`
# accessor only works on datetime-like values, so parse it first.
# Parsing is a no-op if the column has already been converted.
df['pickup_datetime'] = pd.to_datetime(df['pickup_datetime'])

df['year'] = df['pickup_datetime'].dt.year
df['month'] = df['pickup_datetime'].dt.month
df['day'] = df['pickup_datetime'].dt.day
df['hours'] = df['pickup_datetime'].dt.hour
df['weekday'] = df['pickup_datetime'].dt.weekday

# The raw timestamp is no longer needed once its parts are extracted.
df = df.drop(columns=['pickup_datetime'])
df
# identify outliers
import seaborn as sns
import matplotlib.pyplot as plt
# Box plot of the fare column; points drawn beyond the whiskers are the
# outlier candidates.
sns.boxplot(data=df, x='fare_amount')
plt.show()
# Correlation matrix of all remaining columns.
df.corr()
fare_amount pickup_longitude pickup_latitude \
fare_amount 1.000000 0.010532 -0.008573
pickup_longitude 0.010532 1.000000 -0.816461
pickup_latitude -0.008573 -0.816461 1.000000
passenger_count 0.010205 -0.000414 -0.001560
year 0.127932 0.009966 -0.010233
month 0.024222 -0.004665 0.004625
day 0.001224 0.005184 -0.008264
hours -0.021455 0.002433 -0.003822
weekday 0.007641 0.000825 -0.002455
weekday
fare_amount 0.007641
pickup_longitude 0.000825
pickup_latitude -0.002455
passenger_count 0.033196
year 0.006113
month -0.008786
day 0.005617
hours -0.086947
weekday 1.000000
# Annotated heatmap of the correlation matrix.
sns.heatmap(df.corr(), annot=True)
# show() must actually be called: the bare attribute `plt.show` only
# references the function and never displays the figure.
plt.show()
LinearRegression()
# Predictions of the model `lr` for the rows in `x_test`.
# NOTE(review): `lr` and `x_test` are created in cells not shown here --
# presumably the fitted LinearRegression whose repr is printed just above
# and the test split of the features; confirm.
y_pred = lr.predict(x_test)
# Display the predicted values.
y_pred
array([10.55116531, 12.61965214, 12.59597574, ..., 9.48363077,
9.93822128, 12.56229707])
RandomForestRegressor(random_state=43)
# Predictions of the model `rf` for the same `x_test` rows.
# NOTE(review): `rf` is created in a cell not shown here -- presumably the
# fitted RandomForestRegressor whose repr is printed just above; confirm.
y_pred1 = rf.predict(x_test)
# Display the predicted values.
y_pred1
# Score both sets of predictions against `y_test`.
# The `mse_*` names hold the square root of the mean squared error, i.e. the
# RMSE that the printed labels announce.
mse_lr = np.sqrt(mean_squared_error(y_test, y_pred))
r2_lr = r2_score(y_test, y_pred)
mse_rf = np.sqrt(mean_squared_error(y_test, y_pred1))
r2_rf = r2_score(y_test, y_pred1)

# Report: linear regression first, then random forest.
print('RMSE of Linear Regression: ', mse_lr)
print('R2-Score of Linear Regression: ', r2_lr)
print('RMSE of Random Forest: ', mse_rf)
print('R2-Score of Random Forest: ', r2_rf)