# Data Science Assignment Submission
import pandas as pd
import matplotlib.pyplot as mat
import seaborn as sns
# Load the Titanic passenger data.
# The path is a raw string so the Windows backslashes are taken literally; it
# is kept on one line because a single-quoted literal cannot span a line break.
df_titanic = pd.read_csv(
    r'C:\Users\sneha\Desktop\MISM 6212 Data Mining\Assignment\Week 3\Titanic.csv'
)
# =======================================
#Claim 1: More people died than survived
# =======================================
# Number of passengers for each value of 'Survived', returned as a frame with
# the columns 'Survived' and 'Counts'. The parentheses let the expression wrap
# without breaking the assignment.
df_titanic_survived = (
    df_titanic.groupby(['Survived']).size().reset_index(name='Counts')
)
# =======================================
#Claim 3: The third class passengers had the highest chance of survival
# =======================================
# Passenger count for every (Pclass, Survived) combination.
df_titanic_cls = (
    df_titanic.groupby(['Pclass', 'Survived']).size().reset_index(name='Counts')
)
# Percentage of each row within its own class: transform('sum') broadcasts the
# per-Pclass total back onto every row of that class, so the division lines up
# row by row.
df_titanic_cls['Pct'] = (
    100 * df_titanic_cls['Counts']
    / df_titanic_cls.groupby('Pclass')['Counts'].transform('sum')
)
# Split the summary into the rows with Survived == 1 and Survived == 0.
df_titanic_cls_1 = df_titanic_cls.loc[df_titanic_cls['Survived'] == 1]
df_titanic_cls_0 = df_titanic_cls.loc[df_titanic_cls['Survived'] == 0]
# =======================================
#Claim 4: Majority of the people in Titanic were older than 40 years
# =======================================
# NOTE: this is an alias, not a copy, so the 'Age_group' column is also added
# to df_titanic (same behaviour as before).
df_titanic_agegrp = df_titanic
# Label each passenger by age bracket with one vectorised comparison instead
# of a per-row iterrows() loop.
# A missing Age compares False against 40, which previously put those
# passengers in "Greater than 40 years". The .where() leaves their label
# missing instead, so the groupby below excludes them from the counts.
df_titanic_agegrp['Age_group'] = (
    (df_titanic_agegrp['Age'] <= 40)
    .map({True: "<= 40 years", False: "Greater than 40 years"})
    .where(df_titanic_agegrp['Age'].notna())
)
# Passenger count per age bracket, as a frame with 'Age_group' and 'Counts'.
df_titanic_age = (
    df_titanic_agegrp.groupby(['Age_group']).size().reset_index(name='Counts')
)
# =======================================
#Claim 5: Majority of people paid more than 100$ for buying the ticket
# =======================================
# NOTE: this is an alias, not a copy, so the 'Fare_group' column is also added
# to df_titanic (same behaviour as before).
df_titanic_faregrp = df_titanic
# Label each passenger by fare bracket with one vectorised comparison instead
# of a per-row iterrows() loop.
# A missing Fare compares False against 100, which previously put those
# passengers in "Greater than 100$". The .where() leaves their label missing
# instead, so the groupby below excludes them from the pie chart.
df_titanic_faregrp['Fare_group'] = (
    (df_titanic_faregrp['Fare'] <= 100)
    .map({True: "<= 100$", False: "Greater than 100$"})
    .where(df_titanic_faregrp['Fare'].notna())
)
# Pie chart of the passenger count per fare bracket; autopct prints each
# slice's share with one decimal place.
df_titanic_faregrp.groupby(['Fare_group']).size().plot.pie(
    autopct='%1.1f%%',
    startangle=270,
    title="Percentage of people in titanic across fare groups",
    figsize=(15, 10),
    fontsize=12,
    label='Percentage of people',
)
mat.legend()
mat.show()
# Result: Reject the claim, since the majority (94%) of people paid a fare of
# less than or equal to $100.
# =======================================
#Claim 6: Females on average paid more than males for buying the ticket
# =======================================
# Start a fresh figure so this boxplot does not draw onto axes left over from
# an earlier plot.
mat.figure()
# Fare distribution per sex. showmeans adds a marker for each group's mean,
# styled by meanprops as a black square with a white edge.
ax = sns.boxplot(
    x="Sex",
    y="Fare",
    data=df_titanic,
    showmeans=True,
    meanprops={
        "marker": "s",
        "markerfacecolor": "black",
        "markeredgecolor": "white",
    },
)
mat.title("Boxplot of Passenger Fare across Gender")
# Render the figure now so that later plots start from a clean canvas when the
# file is run as a script.
mat.show()
# Result: Accept the claim, since the mean fare for females is higher than for
# males (as seen through the black square markers: 44.48 > 25.63).
# =======================================
#Claim 7: Passengers in Pclass 3 were younger on average than other classes
# =======================================
# Result: Accept the claim, since the mean age of passengers in Pclass 3 is
# lower than in the other classes (as seen through the black square markers).
# =======================================
#Claim 8: Passengers in the first class paid the highest fare
# =======================================
# Start a fresh figure so this boxplot does not draw onto axes left over from
# an earlier plot.
mat.figure()
# Fare distribution per passenger class; showmeans adds a marker for each
# class's mean fare.
sns.boxplot(x='Pclass', y='Fare', data=df_titanic, showmeans=True)
mat.title("Boxplot of Passenger Fare across Pclass")
# Render the figure now so that later plots start from a clean canvas when the
# file is run as a script.
mat.show()
# Result: Accept the claim, as the boxplot indicates that the average, median
# and range of fares are higher for first class passengers.
"""Part 2"""
#Download data for 4 of your favorite stocks
#starting date: 01-01-2019
#end date: today (date you attempt the question)
#plot them on the same graph (only the column "Open" for each stock)
#Use appropriate names for x label, ylabel, and title
#Follow the following specifications
#Figure size: 10*10
#Title font: 25
#xticks and yticks fontsize: 15
#xlabel and ylabel font: 20
#Location of legend: upper left
#Fontsize of legend: 15
import yfinance as yf
# Download daily price history for four tickers from 2019-01-01 onwards. No
# end date is passed, so the range is left to yfinance's default.
df_amzn = yf.download("AMZN", start="2019-01-01")
df_google = yf.download("GOOG", start="2019-01-01")
df_tsla = yf.download("TSLA", start="2019-01-01")
df_wmt = yf.download("WMT", start="2019-01-01")

# Plot the 'Open' column of every stock on one 10x10 figure.
mat.figure(figsize=(10, 10))
mat.plot(df_amzn['Open'], color="orange", label="Amazon")
mat.plot(df_google['Open'], color="red", label="Google")
mat.plot(df_tsla['Open'], color="blue", label="Tesla")
mat.plot(df_wmt['Open'], color="green", label="Walmart")
mat.title("Opening Stock price across time", fontsize=25)
mat.xlabel("Date", fontsize=20)
mat.ylabel("Stock price (USD)", fontsize=20)
mat.xticks(fontsize=15)
mat.yticks(fontsize=15)
# The legend location must be a recognised name (or the integer code 2); the
# string "2" is not a valid location, so the name is spelled out.
mat.legend(loc="upper left", fontsize=15)
# Render the figure when the file is run as a script.
mat.show()
"""Part 3: Refer to file weights. It contains the weight (lbs) of
randomly selected males from United States,
Verify whether the weights seem to be normally distributed"""
#Hint: Check if the distribution of data looks like a bell shaped curve
#check that the mean and median are equal (approximately)
#Check if the data follows the empirical rule
# Empirical rule: For a normal distribution, about 68% of the data falls
# within one standard deviation, about 95% within two standard deviations,
# and about 99.7% within three standard deviations of the mean.
# Observation count per 'Group' value, as a frame with 'Group' and 'Counts'.
# NOTE(review): df_weights and its 'Group' column are created outside this
# excerpt; presumably 'Group' marks the standard-deviation band each weight
# falls in. Confirm against the code that builds it.
df_weights_dist = (
    df_weights.groupby(['Group']).size().reset_index(name='Counts')
)
# Running total of the counts, in the row order produced by the groupby.
df_weights_dist['Cumulative_counts'] = df_weights_dist['Counts'].cumsum()
# Running total expressed as a percentage of all observations.
df_weights_dist['cumulative_pct'] = (
    100 * df_weights_dist['Cumulative_counts'] / df_weights_dist['Counts'].sum()
)
# Result:
# The weights seem to be normally distributed, as the density plot looks
# like a bell-shaped curve. The mean and median are both approximately
# 187.0 (overlapping red and blue dotted lines). The data also follows the
# empirical rule: the share of observations within one, two and three
# standard deviations of the mean matches what a normal distribution predicts.
"""Part 4"""
# Please submit the code for plotting the following graphs using the pokemon data:
# Load the Pokemon data.
# The path is a raw string so the Windows backslashes are taken literally; it
# is kept on one line because a single-quoted literal cannot span a line break.
df_pokemon = pd.read_csv(
    r'C:\Users\sneha\Desktop\MISM 6212 Data Mining\Data\pokemon_data.csv'
)
# Column listing. As a bare expression this only displays its value in an
# interactive console or notebook; it prints nothing when run as a script.
df_pokemon.columns