Assignment Sujith S
Assignment Sujith S
ipynb - Colaboratory
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.impute import SimpleImputer
# Load the OWID COVID CSV and build the working frame from the parsed table.
# NOTE(review): read_csv already returns a DataFrame, so the wrap below looks
# redundant -- confirm whether anything relies on `d` and `df` being two objects.
d = pd.read_csv(filepath_or_buffer=r"/owid-covid-data.csv")
df = pd.DataFrame(data=d)
# View the data: (number of rows, number of columns) of the loaded frame.
df.shape
(231176, 67)
# Preview the first rows of the frame.
df.head()
2020-
0 AFG Asia Afghanistan 5.0 5.0 NaN
02-24
2020-
1 AFG Asia Afghanistan 5.0 0.0 NaN
02-25
2020-
2 AFG Asia Afghanistan 5.0 0.0 NaN
02-26
2020-
3 AFG Asia Afghanistan 5.0 0.0 NaN
02-27
2020-
4 AFG Asia Afghanistan 5.0 0.0 NaN
02-28
5 rows × 67 columns
# Data type of each column.
df.dtypes
iso_code object
continent object
location object
date object
total_cases float64
...
https://colab.research.google.com/drive/1AJFCKFDfnSSH-YAv3_nCI5B9eA7zmP7v#scrollTo=32WSiJI7AmH2&printMode=true 1/13
11/1/22, 12:49 PM assignment.ipynb - Colaboratory
population float64
excess_mortality_cumulative_absolute float64
excess_mortality_cumulative float64
excess_mortality float64
excess_mortality_cumulative_per_million float64
# Summary statistics; include="all" covers non-numeric columns as well as numeric ones.
df.describe(include = "all")
2021-
top MEX Europe Mexico NaN NaN
09-14
11 rows × 67 columns
# Column labels of the frame.
df.columns
'new_deaths_smoothed', 'total_cases_per_million',
'new_cases_per_million', 'new_cases_smoothed_per_million',
'total_deaths_per_million', 'new_deaths_per_million',
'icu_patients_per_million', 'hosp_patients',
'hosp_patients_per_million', 'weekly_icu_admissions',
'weekly_icu_admissions_per_million', 'weekly_hosp_admissions',
'total_tests_per_thousand', 'new_tests_per_thousand',
'new_tests_smoothed', 'new_tests_smoothed_per_thousand',
https://colab.research.google.com/drive/1AJFCKFDfnSSH-YAv3_nCI5B9eA7zmP7v#scrollTo=32WSiJI7AmH2&printMode=true 2/13
11/1/22, 12:49 PM assignment.ipynb - Colaboratory
'new_vaccinations', 'new_vaccinations_smoothed',
'total_vaccinations_per_hundred', 'people_vaccinated_per_hundred',
'people_fully_vaccinated_per_hundred', 'total_boosters_per_hundred',
'new_vaccinations_smoothed_per_million',
'new_people_vaccinated_smoothed',
'new_people_vaccinated_smoothed_per_hundred', 'stringency_index',
'handwashing_facilities', 'hospital_beds_per_thousand',
'excess_mortality_cumulative_absolute', 'excess_mortality_cumulative',
'excess_mortality', 'excess_mortality_cumulative_per_million'],
dtype='object')
,'new_cases_per_million','total_cases_per_million'],axis = 1, inplace = True)
# Shape of the table after dropping some columns.
df.shape
(231176, 63)
ndex. In our dataset we will rename the columns:
ion':'Country','continent':'Continent','iso_code':'ISO_code'},inplace = True )
# Distinct values found in the Continent column, as a plain Python list.
continent = df["Continent"].unique().tolist()
continent
# Simple imputer:
# SimpleImputer fills in the missing values of a dataset. Applied with
# strategy='constant' and no fill_value to the whole mixed-type frame, it writes
# the string "missing_value" into numeric columns as well, so those columns stop
# being numeric. Numeric and non-numeric columns are therefore imputed
# separately: numeric gaps become 0 (the dtype stays numeric) and only the
# non-numeric gaps receive the "missing_value" placeholder.
numeric_cols = df.select_dtypes(include="number").columns
other_cols = df.select_dtypes(exclude="number").columns

# Fresh 0..n-1 index, matching a frame rebuilt from the imputer's array output.
df2 = df.reset_index(drop=True)
if len(numeric_cols) > 0:
    numeric_imputer = SimpleImputer(strategy='constant', fill_value=0)
    df2[numeric_cols] = numeric_imputer.fit_transform(df[numeric_cols])
# `imputer` keeps its original configuration (default placeholder fill).
# NOTE(review): assumes every non-numeric column is object/string typed, as the
# dtypes listing suggests -- confirm before adding other column types.
imputer = SimpleImputer(strategy='constant')
if len(other_cols) > 0:
    df2[other_cols] = imputer.fit_transform(df[other_cols])
https://colab.research.google.com/drive/1AJFCKFDfnSSH-YAv3_nCI5B9eA7zmP7v#scrollTo=32WSiJI7AmH2&printMode=true 3/13
11/1/22, 12:49 PM assignment.ipynb - Colaboratory
# Display the imputed frame.
df2
2020-
0 AFG Asia Afghanistan 5.0 5.0 missing_value mis
02-24
2020-
1 AFG Asia Afghanistan 5.0 0.0 missing_value mis
02-25
2020-
2 AFG Asia Afghanistan 5.0 0.0 missing_value mis
02-26
2020-
3 AFG Asia Afghanistan 5.0 0.0 missing_value mis
02-27
2020-
4 AFG Asia Afghanistan 5.0 0.0 missing_value mis
02-28
2022-
231171 ZWE Africa Zimbabwe 257893.0 0.0 5606.0
10-25
2022-
231172 ZWE Africa Zimbabwe 257893.0 0.0 5606.0
10-26
2022-
231173 ZWE Africa Zimbabwe 257893.0 0.0 5606.0
10-27
2022-
231174 ZWE Africa Zimbabwe 257893.0 0.0 5606.0
10-28
2022-
231175 ZWE Africa Zimbabwe 257893.0 0.0 5606.0
10-29
df2.groupby(['Date','Country'])[['Date','Country','total_cases','total_deaths','total_vaccina
df2
https://colab.research.google.com/drive/1AJFCKFDfnSSH-YAv3_nCI5B9eA7zmP7v#scrollTo=32WSiJI7AmH2&printMode=true 4/13
11/1/22, 12:49 PM assignment.ipynb - Colaboratory
2020-
0 AFG Asia Afghanistan 5.0 5.0 missing_value mis
02-24
2020-
1 AFG Asia Afghanistan 5.0 0.0 missing_value mis
02-25
2020-
2 AFG Asia Afghanistan 5.0 0.0 missing_value mis
02-26
2020-
3 AFG Asia Afghanistan 5.0 0.0 missing_value mis
02-27
2020-
4 AFG Asia Afghanistan 5.0 0.0 missing_value mis
02-28
2022-
231171 ZWE Africa Zimbabwe 257893.0 0.0 5606.0
10-25
2022-
231172 ZWE Africa Zimbabwe 257893.0 0.0 5606.0
10-26
2022-
231173 ZWE Africa Zimbabwe 257893.0 0.0 5606.0
10-27
2022-
df3 = df2.groupby(['Date','Country'])[['Date','Country','total_cases','total_deaths','total_v
231174 ZWE Africa Zimbabwe 257893.0 0.0 5606.0
10-28
# Show the last 10 rows of df3.
df3.tail(10)
2022-
231175 ZWE Africa Zimbabwe 257893.0 0.0 5606.0
10-29
https://colab.research.google.com/drive/1AJFCKFDfnSSH-YAv3_nCI5B9eA7zmP7v#scrollTo=32WSiJI7AmH2&printMode=true 5/13
11/1/22, 12:49 PM assignment.ipynb - Colaboratory
#change missing_value to 0
231170 Date
2022-10-29 Country total_cases257893.0
Zimbabwe total_deaths 5606.0
total_vaccinations
missing_value
0
231171 2020-01-01
2022-10-30 Argentina Austria 0.0
missing_value 0.0
missing_value 0.0
missing_value
1
231172 2020-01-01
2022-10-30 MexicoGermany 0.0
missing_value 0.0
missing_value 0.0
missing_value
2
231173 2020-01-02
2022-10-30 Argentina Israel 0.0
missing_value 0.0
missing_value 0.0
missing_value
3
231174 2020-01-02
2022-10-30 MexicoMalaysia 0.0
missing_value 0.0
missing_value 0.0
missing_value
4
231175 2020-01-03
2022-10-30 Argentina Russia 0.0
missing_value 0.0
missing_value 0.0
missing_value
... ... ... ... ... ...
# Rows of df3 whose total_deaths value is greater than 1000000.
df4 = df3.loc[df3['total_deaths'] > 1000000]
df4
https://colab.research.google.com/drive/1AJFCKFDfnSSH-YAv3_nCI5B9eA7zmP7v#scrollTo=32WSiJI7AmH2&printMode=true 6/13
11/1/22, 12:49 PM assignment.ipynb - Colaboratory
# Unique Country values among the rows where total_deaths is greater than 1000000.
# NOTE(review): the Country column also carries aggregate entries (e.g. 'World',
# 'High income', 'Europe'), so this count is not a count of countries only --
# confirm whether aggregates should be excluded.
countries = df4['Country'].unique()
print(len(countries))
print()
print("conuntry_deaths_greater_than_1000000 : ")
print()
# Reuse the array computed above instead of scanning df4 a second time.
conuntry_deaths_greater_than_1000000 = list(countries)
conuntry_deaths_greater_than_1000000
10
conuntry_deaths_greater_than_1000000 :
['World',
'High income',
'Europe',
'South America',
'Asia',
'North America',
'European Union',
'United States']
New Section
https://colab.research.google.com/drive/1AJFCKFDfnSSH-YAv3_nCI5B9eA7zmP7v#scrollTo=32WSiJI7AmH2&printMode=true 7/13
11/1/22, 12:49 PM assignment.ipynb - Colaboratory
# Plot the trend for every entry in `countries`: one figure each, with the
# total_cases, total_deaths and total_vaccinations columns drawn against the
# row position inside that entry's slice of df4.
# NOTE(review): df4 only holds rows with total_deaths > 1000000, so position 0
# is the first such row rather than the first recorded case -- confirm that the
# x-axis wording is what is intended.
for country in countries:
    C = df4[df4['Country'] == country].reset_index()
    positions = np.arange(0, len(C))
    plt.scatter(positions, C['total_cases'], color="blue", label="total_cases")
    plt.scatter(positions, C['total_deaths'], color="red", label="total_deaths")
    # NOTE(review): this label was cut off after "tot" in the exported source;
    # it is reconstructed from the column name -- confirm.
    plt.scatter(positions, C['total_vaccinations'], color="green", label="total_vaccinations")
    plt.title(country)
    plt.xlabel("Number of days since first suspect")
    plt.ylabel("Number of cases")
    plt.legend()
    plt.show()
https://colab.research.google.com/drive/1AJFCKFDfnSSH-YAv3_nCI5B9eA7zmP7v#scrollTo=32WSiJI7AmH2&printMode=true 8/13
11/1/22, 12:49 PM assignment.ipynb - Colaboratory
https://colab.research.google.com/drive/1AJFCKFDfnSSH-YAv3_nCI5B9eA7zmP7v#scrollTo=32WSiJI7AmH2&printMode=true 9/13
11/1/22, 12:49 PM assignment.ipynb - Colaboratory
https://colab.research.google.com/drive/1AJFCKFDfnSSH-YAv3_nCI5B9eA7zmP7v#scrollTo=32WSiJI7AmH2&printMode=true 10/13
11/1/22, 12:49 PM assignment.ipynb - Colaboratory
# Group the rows of df4 by Country and sum total_cases / total_deaths per group.
# 'Country' is the grouping key, so it is left out of the aggregated columns and
# comes back as an ordinary column through reset_index(); listing it in both
# places can make reset_index() fail with "cannot insert Country, already exists".
# NOTE(review): the call parentheses on reset_index were cut off in the exported
# source and are restored here by analogy with the df6 statement -- confirm.
# NOTE(review): if total_cases / total_deaths are running totals, adding them up
# across dates inflates the figures -- confirm that sum() is the intended
# aggregation.
df5 = df4.groupby(['Country'])[['total_cases', 'total_deaths']].sum().reset_index()
df5
# Scatter the per-Country totals held in df5.
C = df5
# df5 has one row per Country group, so the x position is that group's row
# number, not a day count; the x-axis label is worded accordingly.
positions = np.arange(0, len(C))
plt.scatter(positions, C['total_cases'], color="blue", label="total_cases")
plt.scatter(positions, C['total_deaths'], color="red", label="total_deaths")
plt.title("World")
plt.xlabel("Country (row position in df5)")
plt.ylabel("Number of cases")
plt.legend()
plt.show()
https://colab.research.google.com/drive/1AJFCKFDfnSSH-YAv3_nCI5B9eA7zmP7v#scrollTo=32WSiJI7AmH2&printMode=true 11/13
11/1/22, 12:49 PM assignment.ipynb - Colaboratory
# Analysis by date: distinct Date values among the rows of df4 (the rows where
# total_deaths is greater than 1000000), followed by how many there are.
date = df4.loc[:, 'Date'].unique()
len(date)
774
# Sum total_cases / total_deaths over all rows of df4 that share a Date.
# 'Date' is the grouping key, so it is left out of the aggregated columns and
# comes back as an ordinary column through reset_index(); listing it in both
# places can make reset_index() fail with "cannot insert Date, already exists".
# NOTE(review): df4 contains aggregate entries such as 'World' next to their
# members, so per-date sums may count the same cases more than once -- confirm.
df6 = df4.groupby(['Date'])[['total_cases', 'total_deaths']].sum().reset_index()
df6
# Graph plotting by Date: scatter the per-date totals held in df6 against the
# row position.
# NOTE(review): both scatter calls were cut off inside their label strings in
# the exported source; the labels are completed to match the plotted column
# names, as in the equivalent df5 plot -- confirm.
C = df6
positions = np.arange(0, len(C))
plt.scatter(positions, C['total_cases'], color="blue", label="total_cases")
plt.scatter(positions, C['total_deaths'], color="red", label="total_deaths")
plt.title("World")
plt.xlabel("Number of days since first suspect")
plt.ylabel("Number of cases")
https://colab.research.google.com/drive/1AJFCKFDfnSSH-YAv3_nCI5B9eA7zmP7v#scrollTo=32WSiJI7AmH2&printMode=true 12/13
11/1/22, 12:49 PM assignment.ipynb - Colaboratory
# Label the y axis, draw the legend and render the figure. The label text had
# lost its quotes, which made the statement a syntax error.
plt.ylabel("Number of cases")
plt.legend()
plt.show()
https://colab.research.google.com/drive/1AJFCKFDfnSSH-YAv3_nCI5B9eA7zmP7v#scrollTo=32WSiJI7AmH2&printMode=true 13/13