Assignment 2 – DMS672
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
from scipy.stats import chi2_contingency
# 1. Summary Table
# Build one row of descriptive statistics per column of `df`: dtype, number of
# distinct values and, for numeric columns only, mean / median / standard
# deviation / (min, max) range. The rows are collected into `summary_df`.
summary = []
for col in df.columns:
    dtype = df[col].dtype
    # dropna=False makes a missing value count as one distinct value.
    distinct = df[col].nunique(dropna=False)
    if pd.api.types.is_numeric_dtype(df[col]):
        mean = df[col].mean()
        median = df[col].median()
        std = df[col].std()
        rng = (df[col].min(), df[col].max())
    else:
        # Location/spread statistics are not computed for non-numeric columns.
        mean = median = std = None
        rng = (None, None)
    summary.append({
        'Attribute': col,
        'Type': str(dtype),
        'Distinct': distinct,
        'Mean': mean,
        'Median': median,
        'Std': std,
        # Rendered as "min – max"; non-numeric columns show "None – None".
        'Range': f"{rng[0]} – {rng[1]}",
    })
summary_df = pd.DataFrame(summary)
# NOTE(review): `display` is not imported in the visible code — presumably the
# IPython/Jupyter built-in; confirm this runs inside a notebook session.
display(summary_df)
    Attribute     Type  Distinct       Mean   Median        Std           Range
3        Name   object       891        NaN      NaN        NaN     None – None
4         Sex   object         2        NaN      NaN        NaN     None – None
5         Age  float64        89  29.699118  28.0000  14.526497     0.42 – 80.0
8      Ticket   object       681        NaN      NaN        NaN     None – None
9        Fare  float64       248  32.204208  14.4542  49.693429  0.0 – 512.3292
10      Cabin   object       148        NaN      NaN        NaN     None – None
11   Embarked   object         4        NaN      NaN        NaN     None – None
# Exploratory plots: Age histogram, Fare boxplot, Age QQ plot and a
# correlation heatmap. Each plot gets its own figure and is shown immediately.
small_figsize = (6, 4)
age = df['Age']

# Histogram of Age
plt.figure(figsize=small_figsize)
age.hist(bins=20, edgecolor='black', color='skyblue')
plt.title('Histogram of Age')
plt.ylabel('Count')
plt.xlabel('Age')
plt.grid(True)
plt.tight_layout()
plt.show()

# Boxplot of Fare
plt.figure(figsize=small_figsize)
df.boxplot(column='Fare', grid=False)
plt.title('Boxplot of Fare')
plt.ylabel('Fare')
plt.tight_layout()
plt.show()

# QQ Plot of Age
# Missing ages are dropped before comparing against normal quantiles.
observed_ages = age.dropna()
plt.figure(figsize=small_figsize)
stats.probplot(observed_ages, dist="norm", plot=plt)
plt.title('QQ Plot of Age')
plt.grid(True)
plt.tight_layout()
plt.show()

# Correlation Heatmap
heatmap_columns = ['Survived', 'Age', 'Fare', 'Pclass', 'SibSp', 'Parch']
plt.figure(figsize=(6, 5))
corr = df[heatmap_columns].corr()
# Each cell is annotated with its coefficient to two decimals.
sns.heatmap(corr, cmap='coolwarm', annot=True, fmt=".2f")
plt.title('Correlation Heatmap')
plt.tight_layout()
plt.show()
Boxplot (Fare):
Shows many outliers, especially in the higher fare range. Median Fare is low,
indicating most passengers paid a small amount.
QQ Plot (Age):
Age does not follow a perfect normal distribution; tails deviate from the normal
line.
Correlation Heatmap:
Positive correlation between Fare and Survived. Negative correlation between
Pclass and Survived (i.e., a higher Pclass number, meaning a lower travel class,
goes with lower survival).
Pair Plot:
Survivors tend to be in higher classes (Pclass=1) and paid higher fares.
# Chi-square test of independence between each categorical feature and
# `Survived`; one result row per feature is appended to `chi_square_summary`.
# NOTE(review): `categorical_features` and `chi_square_summary` are not defined
# in the visible part of this file — confirm they are initialised earlier in
# the notebook (a list of column names and an empty list, presumably).
print("🔍 Chi-Square Test Results:\n")
for feature in categorical_features:
    # Create contingency table
    contingency_table = pd.crosstab(df[feature], df['Survived'])
    # Run the test. The fourth return value (expected frequencies) is unused.
    chi2_stat, p_val, dof, _ = chi2_contingency(contingency_table)
    # Append results
    chi_square_summary.append({
        'Feature': feature,
        'Chi-Square': round(chi2_stat, 2),
        'p-value': round(p_val, 4),
        'DoF': dof,
        # Significance is judged on the unrounded p-value at the 5% level.
        'Significant (< 0.05)': 'Yes' if p_val < 0.05 else 'No'
    })
# -------------------------------
# Survival Rate by Each Feature
# -------------------------------
# For each feature, print the mean of `Survived` per category as a two-column
# table (category, 'Survival Rate').
print("\n📊 Survival Rates by Category:\n")
for feature in ['Sex', 'Pclass', 'Embarked']:
    # The group mean of `Survived` is reported as the survival rate.
    # reset_index() turns the group labels back into a regular column.
    survival_rate = df.groupby(feature)['Survived'].mean().reset_index()
    survival_rate.columns = [feature, 'Survival Rate']
    print(f"\nSurvival Rate by {feature}:")
    # NOTE(review): `display` is presumably the IPython/Jupyter built-in.
    display(survival_rate)
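Survival Rate by Sex:
      Sex  Survival Rate
0  female       0.742038
1    male       0.188908

Survival Rate by Pclass:
   Pclass  Survival Rate
0       1       0.629630
1       2       0.472826
2       3       0.242363

Survival Rate by Embarked:
  Embarked  Survival Rate
0        C       0.553571
1        Q       0.389610
2        S       0.336957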
# 2. Survival by Sex
# Bar chart of `Survived` per `Sex`, using seaborn's default aggregation.
plt.figure(figsize=(6, 4))
# Passing `palette` without `hue` is deprecated in recent seaborn and emits a
# FutureWarning. Mapping `hue` to the x variable and hiding the redundant
# legend keeps the same per-bar colouring without the warning.
sns.barplot(x='Sex', y='Survived', hue='Sex', data=df, palette='pastel',
            legend=False)
plt.title("Survival Rate by Sex")
plt.ylabel("Survival Rate")
plt.show()
<ipython-input-19-cf2e0f97e5be>:6: FutureWarning:
<ipython-input-19-cf2e0f97e5be>:13: FutureWarning:
<ipython-input-19-cf2e0f97e5be>:49: FutureWarning:
1st class passengers had ~63% survival rate, much higher than 3rd class
(~24%).
Children (age < 10) had higher survival rates, supporting the "women and
children first" approach.
Small families (1–2 siblings/spouses) showed higher survival rates than those
alone or in large groups.
Overall, being female, young, in a higher class, and having family nearby
improved chances of survival.