ProgrammingFinal Q1-3
ProgrammingFinal Q1-3
ipynb - Colaboratory
import requests # The requests library is an
# HTTP library for getting and posting content etc.
import bs4 as bs# BeautifulSoup4 is a Python library
# for pulling data out of HTML and XML code.
# We can query markup languages for specific content
import pandas as pd
#Pandas to convert solution in a dataframe
import numpy as np
# Download the Worldometer coronavirus page with an HTTP GET request.
# The address is the site's own URL: the previous value had a proxy host
# ("fanyv88.com") spliced in front of it, so the request did not go to
# worldometers.info directly.
# A timeout keeps the call from blocking forever if the server never answers.
source = requests.get("https://www.worldometers.info/coronavirus/", timeout=30)
source
<Response [200]>
A response status of 200 means the request succeeded (OK). We will now parse the page content with BeautifulSoup.
# Parse the raw bytes of the HTTP response into a navigable document tree,
# using Python's built-in 'html.parser' backend.
soup = bs.BeautifulSoup(markup=source.content, features='html.parser')
type(soup)
bs4.BeautifulSoup
# Locate the world data table by its element id.
table = soup.find('table', id='main_table_countries_today')
if table is None:
    # Fail with a clear message instead of an AttributeError further down.
    raise ValueError("table 'main_table_countries_today' not found in the page")
# Find all the rows in the table (looked up once and reused by the loop below).
rows = table.find_all('tr')
# Extract the column names from the header cells of the table.
headers = [th.text.strip() for th in table.find_all('th')]
# One list of stripped cell texts per kept table row.
data_rows = []
# The first 9 rows are skipped.
# NOTE(review): presumably these are the header and regional summary rows —
# confirm against the live page, the offset is hard-coded.
for tr in rows[9:]:
    data_row = [td.text.strip() for td in tr.find_all('td')]
    # Keep only rows whose second cell is non-empty; the length check avoids
    # an IndexError on rows that contain fewer than two <td> cells.
    if len(data_row) > 1 and data_row[1]:
        data_rows.append(data_row)
# Create a Pandas DataFrame from the data
df = pd.DataFrame(data_rows, columns=headers)
df
https://colab.research.google.com/drive/1rd_TELvj4andLtYLTs1it0Ad8rqSD8du#scrollTo=JishMpO8AY2q&printMode=true 1/12
26/02/2023, 21:09 PragyaWasan_ProgrammingFinal_Q1-3 .ipynb - Colaboratory
# Keep only the desired columns. .copy() yields an independent frame, so the
# column assignments below do not write into a slice of the scraped table.
df = df[['Country,Other', 'TotalCases', 'TotalDeaths', 'TotalTests', 'Population']].copy()
# Rename columns
df = df.rename(columns={'Country,Other': 'Country', 'TotalCases': 'Cases', 'TotalDeaths': 'Deaths', 'TotalTests': 'Tests'})
df.head()
# Set the index of the DataFrame to be the country name
df = df.set_index('Country')
# The scraped cells are text (e.g. "1,378"), so comparing them with 0 never
# removes a row. Strip the thousands separators and convert to numbers first;
# anything that is not a number (empty cell, "N/A", ...) becomes NaN.
for column in ['Cases', 'Deaths', 'Tests', 'Population']:
    without_commas = df[column].astype(str).str.replace(',', '', regex=False)
    df[column] = pd.to_numeric(without_commas, errors='coerce')
# Drop rows with zero or non-numeric data for Total Deaths or Total Tests
# (NaN > 0 is False, so non-numeric rows are dropped by the same test).
df = df[(df['Deaths'] > 0) & (df['Tests'] > 0)]
df
https://colab.research.google.com/drive/1rd_TELvj4andLtYLTs1it0Ad8rqSD8du#scrollTo=JishMpO8AY2q&printMode=true 2/12
26/02/2023, 21:09 PragyaWasan_ProgrammingFinal_Q1-3 .ipynb - Colaboratory
Country
Tokelau 5 1,378
China 503,302 5,272 160,000,000 1,448,471,400
231 rows × 4 columns
/usr/local/lib/python3.8/dist-packages/pandas/core/frame.py:3641: SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead
See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-
self[k1] = value[k2]
Cases Deaths Tests Population
Country
# Tests performed per confirmed case, sorted from highest to lowest.
# assign() builds a new DataFrame instead of writing a column into the
# filtered frame, which is what triggered the SettingWithCopyWarning.
df = (
    df.assign(tests_per_case=df['Tests'] / df['Cases'])
    .sort_values(by='tests_per_case', ascending=False)
)
df.head(20)
https://colab.research.google.com/drive/1rd_TELvj4andLtYLTs1it0Ad8rqSD8du#scrollTo=JishMpO8AY2q&printMode=true 3/12
26/02/2023, 21:09 PragyaWasan_ProgrammingFinal_Q1-3 .ipynb - Colaboratory
<ipython-input-89-716111094444>:2: SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead
Country
Tonga 16801 13 535009 107749 31.843878
Hong Kong 2883157 13451 76127725 7604299 26.404294
Gibraltar 20423 111 534283 33704 26.160848
… resources when evaluating a country's response to the pandemic.
Question 2: Exploring prediction algorithms on vaccination data.
# Vaccination data CSV, fetched straight from raw.githubusercontent.com.
# The previous value had a proxy host ("fanyv88.com") spliced in front of the
# GitHub address.
url = 'https://raw.githubusercontent.com/owid/covid-19-data/master/public/data/vaccinations/vaccinations.csv'
df = pd.read_csv(url)
df
https://colab.research.google.com/drive/1rd_TELvj4andLtYLTs1it0Ad8rqSD8du#scrollTo=JishMpO8AY2q&printMode=true 4/12
26/02/2023, 21:09 PragyaWasan_ProgrammingFinal_Q1-3 .ipynb - Colaboratory
2021-
#To ensure that the date column is a Pandas datetime object, we can use the to_datetime function
0 Afghanistan AFG 0.0 0.0 NaN NaN
02-22
# Replace the 'date' column with its parsed pandas datetime values.
df = df.assign(date=pd.to_datetime(df['date']))
2021-
df 1 Afghanistan AFG NaN NaN NaN NaN
02-23
2022-
157692 Zimbabwe ZWE NaN NaN NaN NaN
10-08
2022-
157693 Zimbabwe ZWE 12222754.0 6437808.0 4751270.0 1033676.0
10-09
# Collapse the table to one row per date holding the sum of
# 'total_vaccinations' over all rows for that date; as_index=False keeps
# 'date' as an ordinary column.
# NOTE(review): every location in the file is summed. If the file also holds
# aggregate locations (e.g. a world total), they are counted as well — confirm.
df = df.groupby('date', as_index=False)['total_vaccinations'].sum()
df
date total_vaccinations
0 2020-12-02 0.000000e+00
1 2020-12-03 0.000000e+00
2 2020-12-04 5.000000e+00
3 2020-12-05 4.000000e+00
4 2020-12-06 4.000000e+00
# Replace any remaining missing values with 0.
# NOTE(review): sum() skips NaN by default, so this is likely a no-op after
# the groupby above — confirm.
df = df.fillna(value=0)
df
https://colab.research.google.com/drive/1rd_TELvj4andLtYLTs1it0Ad8rqSD8du#scrollTo=JishMpO8AY2q&printMode=true 5/12
26/02/2023, 21:09 PragyaWasan_ProgrammingFinal_Q1-3 .ipynb - Colaboratory
date total_vaccinations
0 2020-12-02 0.000000e+00
1 2020-12-03 0.000000e+00
2 2020-12-04 5.000000e+00
3 2020-12-05 4.000000e+00
4 2020-12-06 4.000000e+00
813 2023-02-23 4.508844e+10
815 2023-02-25 4.145385e+10
Plotting the deaths
# Set up plot
fig, ax = plt.subplots(figsize=(12, 6))
816 rows × 2 columns
# NOTE(review): plt, mpl_dates and MonthLocator are used below but no import
# for them is visible in this file — confirm an earlier cell provides them.
# Title and axis labels
ax.set(
    title="Total Number of COVID-19 Vaccinations Globally",
    xlabel="Date",
    ylabel="Total Number of Vaccinations",
)
# Plotting window: 2020-12-01 through 2023-02-01, both bounds included
start_date = '2020-12-01'
end_date = '2023-02-01'
df_filtered = df[df['date'].between(start_date, end_date)]
# Draw the series as a thin blue line
ax.plot(df_filtered['date'], df_filtered['total_vaccinations'], linewidth=0.7, color='blue')
# Clamp the x-axis to the same window
ax.set_xlim(left=mpl_dates.datestr2num(start_date), right=mpl_dates.datestr2num(end_date))
# One major tick on the first day of every month, labelled as YYYY-MM-DD
ax.xaxis.set_major_formatter(mpl_dates.DateFormatter('%Y-%m-%d'))
ax.xaxis.set_major_locator(MonthLocator(bymonthday=1))
fig.autofmt_xdate()
# Rotate the tick labels to 45 degrees, right-aligned
plt.setp(ax.get_xticklabels(), rotation=45, ha="right")
# Render the figure
plt.show()
Linear regression
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
https://colab.research.google.com/drive/1rd_TELvj4andLtYLTs1it0Ad8rqSD8du#scrollTo=JishMpO8AY2q&printMode=true 6/12
26/02/2023, 21:09 PragyaWasan_ProgrammingFinal_Q1-3 .ipynb - Colaboratory
# Feature: the row position (DataFrame index) as a single column.
# Target: the summed vaccinations as a single column.
X = df.index.to_numpy().reshape(-1, 1)
y = df['total_vaccinations'].to_numpy().reshape(-1, 1)
# Fit an ordinary least-squares line and predict on the same rows
reg = LinearRegression()
reg.fit(X, y)
y_pred = reg.predict(X)
# Draw the fitted line first, then the observed series, on one set of axes
fig, ax = plt.subplots(figsize=(10, 6))
ax.plot(df['date'], y_pred, label='Predicted')
ax.plot(df['date'], df['total_vaccinations'], label='True')
# Title and axis labels
ax.set_title('Total Number of Vaccinations over Time Globally')
ax.set_xlabel('Date')
ax.set_ylabel('Total Vaccinations')
# Dates on the x-axis are shown like "Jan 01, 2021"
date_format = mpl_dates.DateFormatter('%b %d, %Y')
ax.xaxis.set_major_formatter(date_format)
fig.autofmt_xdate()
# Legend, then render
ax.legend()
plt.show()
# Mean squared error of the fit, measured on the data it was trained on
mse = mean_squared_error(y, y_pred)
print('Mean squared error:', mse)
from sklearn.linear_model import RidgeCV, Ridge
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
# Re-read the vaccination file, parse its 'date' column into pandas datetime
# values, and keep only the rows whose location is South Korea.
df_sk = (
    pd.read_csv(url)
    .assign(date=lambda frame: pd.to_datetime(frame['date']))
    .loc[lambda frame: frame['location'] == 'South Korea']
)
df_sk
https://colab.research.google.com/drive/1rd_TELvj4andLtYLTs1it0Ad8rqSD8du#scrollTo=JishMpO8AY2q&printMode=true 7/12
26/02/2023, 21:09 PragyaWasan_ProgrammingFinal_Q1-3 .ipynb - Colaboratory
South 2021-
131235 KOR 50357.0 39175.0 11358.0 2.0
Korea 02-26
South 2021-
131236 KOR 52325.0 40919.0 11581.0 2.0
Korea 02-27
South 2021-
131237 KOR 53465.0 41947.0 11688.0 2.0
Korea 02-28
South 2021-
131238 KOR 55673.0 43881.0 11959.0 2.0
Korea 03-01
South 2021-
131239 KOR 122214.0 108915.0 12270.0 3.0
Korea 03-02
South 2023-
131958 KOR NaN 44845635.0 44429115.0 NaN
Korea 02-19
South 2023-
131959 KOR NaN 44845775.0 44429290.0 NaN
Korea 02-20
South 2023-
131960 KOR NaN 44846025.0 44429485.0 NaN
Korea 02-21
South 2023-
131961 KOR NaN 44846315.0 44429716.0 NaN
Korea 02-22
South 2023-
131962 KOR NaN 44846559.0 44429953.0 NaN
Korea 02-23
# Define train and test start and end dates
train_start_date = '2021-08-01'
train_end_date = '2021-09-30'
test_start_date = '2021-10-01'
test_end_date = '2021-10-08'
# Filter the data by train and test dates (both bounds inclusive)
train_df = df_sk[(df_sk['date'] >= train_start_date) & (df_sk['date'] <= train_end_date)]
test_df = df_sk[(df_sk['date'] >= test_start_date) & (df_sk['date'] <= test_end_date)]
# Extract the target variable (total vaccinations) as a single column
y_train = train_df['total_vaccinations'].values.reshape(-1, 1)
# Extract the feature: whole days elapsed since the start of the train period.
# (This assignment used to appear twice in a row; once is enough.)
X_train = (train_df['date'] - pd.to_datetime(train_start_date)).dt.days.values.reshape(-1, 1)
# Create a Linear Regression model and fit it to the training data
model = LinearRegression()
model.fit(X_train, y_train)
LinearRegression()
# Candidate regularization strengths: 11 log-spaced values from 1e-5 to 1e5
alphas = np.logspace(-5, 5, num=11)
# Ridge regression that picks its alpha by 5-fold cross-validation,
# fitted on the training data (this rebinds `model`)
model = RidgeCV(alphas=alphas, cv=5)
model.fit(X_train, y_train)
# Two-step pipeline: standardize the feature, then fit a Ridge model
pipeline = Pipeline(steps=[('scaler', StandardScaler()), ('ridge', Ridge())])
# Grid of alpha values for the pipeline's 'ridge' step: 1e-3 ... 1e3
param_grid = dict(ridge__alpha=np.logspace(-3, 3, num=7))
# Perform a grid search using cross-validation
https://colab.research.google.com/drive/1rd_TELvj4andLtYLTs1it0Ad8rqSD8du#scrollTo=JishMpO8AY2q&printMode=true 8/12
26/02/2023, 21:09 PragyaWasan_ProgrammingFinal_Q1-3 .ipynb - Colaboratory
# 5-fold cross-validated search over param_grid, scored by negative MSE
grid_search = GridSearchCV(
    estimator=pipeline,
    param_grid=param_grid,
    cv=5,
    scoring='neg_mean_squared_error',
)
grid_search.fit(X_train, y_train)
# Report the winning parameters; the score is a negated MSE, so flip its sign
print('Best Parameters:', grid_search.best_params_)
print('MSE:', -grid_search.best_score_)
Best Parameters: {'ridge__alpha': 1.0}
MSE: 1425516378643.75
# Make predictions for the test set using the best estimator found by the
# grid search. The feature is days since the start of the *train* period, the
# same origin that was used to build X_train.
best_estimator = grid_search.best_estimator_
X_test = (test_df['date'] - pd.to_datetime(train_start_date)).dt.days.values.reshape(-1, 1)
y_test_pred = best_estimator.predict(X_test)
# Calculate the Root Mean Squared Error for the test set.
# The predictions are no longer overwritten with model.predict(X_test), which
# silently replaced the best estimator's output with the RidgeCV model's.
y_test = test_df['total_vaccinations'].values.reshape(-1, 1)
mse = mean_squared_error(y_test, y_test_pred)
rmse = np.sqrt(mse)
print('Test RMSE:', rmse)
# Compare observed and predicted totals over the test dates on one set of axes
fig, ax = plt.subplots(figsize=(10, 6))
ax.plot(test_df['date'], y_test, label='True')
ax.plot(test_df['date'], y_test_pred, label='Predicted')
ax.set_title('COVID-19 Vaccinations in South Korea')
ax.set_xlabel('Date')
ax.set_ylabel('Total Vaccinations')
# Dates on the x-axis are shown like "Oct 01, 2021"
date_format = mpl_dates.DateFormatter('%b %d, %Y')
ax.xaxis.set_major_formatter(date_format)
fig.autofmt_xdate()
ax.legend()
plt.show()
# Read the three tab-separated text files (Google stock, Yahoo stock,
# NY temperature) from the working directory.
google_data = pd.read_csv("google.txt", sep='\t')
yahoo_data = pd.read_csv("yahoo.txt", sep='\t')
ny_temp_data = pd.read_csv("ny.txt", sep='\t')
google_data
https://colab.research.google.com/drive/1rd_TELvj4andLtYLTs1it0Ad8rqSD8du#scrollTo=JishMpO8AY2q&printMode=true 9/12
26/02/2023, 21:09 PragyaWasan_ProgrammingFinal_Q1-3 .ipynb - Colaboratory
0 55463 527.21
1 55462 513.48
2 55461 516.00
3 55460 513.46
4 55459 508.28
0 55463 14.40
1 55462 14.17
2 55461 14.04
3 55460 14.18
4 55459 13.86
ny_temp_data
0 48988 52
1 49019 38
2 49047 31
3 49078 66
4 49108 75
197 55044 81
198 55075 71
199 55105 56
200 55136 68
201 55166 48
# Index each of the three frames, in place, by its 'Modified Julian Date' column
for frame in (google_data, yahoo_data, ny_temp_data):
    frame.set_index('Modified Julian Date', inplace=True)
google_data
https://colab.research.google.com/drive/1rd_TELvj4andLtYLTs1it0Ad8rqSD8du#scrollTo=JishMpO8AY2q&printMode=true 10/12
26/02/2023, 21:09 PragyaWasan_ProgrammingFinal_Q1-3 .ipynb - Colaboratory
Stock Value
55463 527.21
55462 513.48
55461 516.00
55460 513.46
55459 508.28
... ...
53242 106.00
53241 104.87
53240 109.40
53237
# Create a figure and axis object 108.31
fig, ax1 = plt.subplots(figsize=(10, 6))
53236 100.34
# Plot the Google stock data on the left-hand axis
ax1.plot(google_data.index, google_data['Stock Value'], color='green', label='Google Stock Value')
# Plot the Yahoo! stock data on the same axis. The title names Yahoo! and the
# file is loaded and indexed, but the series was never drawn.
# NOTE(review): assumes yahoo.txt uses the same 'Stock Value' column name as
# google.txt — confirm against the file header.
ax1.plot(yahoo_data.index, yahoo_data['Stock Value'], color='purple', label='Yahoo! Stock Value')
ax1.set_ylabel('Value (Dollars)', fontsize=14, color='purple')
ax1.tick_params(axis='y', labelcolor='green')
# Create a second y-axis on the right side of the plot, sharing the x-axis
ax2 = ax1.twinx()
# Plot the NY temperature data as a dashed blue line on the right-hand axis
ax2.plot(ny_temp_data.index, ny_temp_data['Max Temperature'], color='blue', label='NY Mon. High Temp', linestyle='--')
ax2.set_ylabel('Temperature (°F)', fontsize=14, color='blue')
ax2.tick_params(axis='y', labelcolor='blue')
# Set the right-hand y-axis limits and ticks: -150 to 100 in steps of 50
ax2.set_ylim(-150, 100)
ax2.set_yticks(range(-150, 101, 50))
ax2.set_yticklabels(['{:.0f}'.format(x) for x in range(-150, 101, 50)])
# Set the title and x-label
ax1.set_title('New York Temperature, Google, and Yahoo!', fontsize=16, fontweight='bold')
ax1.set_xlabel('Date (MJD)', fontsize=14)
# Add one combined legend: twin axes keep separate handle lists, so merge them
lines1, labels1 = ax1.get_legend_handles_labels()
lines2, labels2 = ax2.get_legend_handles_labels()
ax1.legend(lines1 + lines2, labels1 + labels2, loc='center left', fontsize=12)
# Show the graph
plt.show()
https://colab.research.google.com/drive/1rd_TELvj4andLtYLTs1it0Ad8rqSD8du#scrollTo=JishMpO8AY2q&printMode=true 11/12
26/02/2023, 21:09 PragyaWasan_ProgrammingFinal_Q1-3 .ipynb - Colaboratory
https://colab.research.google.com/drive/1rd_TELvj4andLtYLTs1it0Ad8rqSD8du#scrollTo=JishMpO8AY2q&printMode=true 12/12