Homework2
Homework2
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.impute import KNNImputer
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
from scipy import stats
from scipy.stats import kruskal
from scipy.stats import chi2_contingency
# Load the passenger records from the CSV export; `data` holds the raw load
# and `df` is the frame the following statements work on.
data = pd.read_csv('Titanic-Dataset.csv')
df = pd.DataFrame(data)
# Sanity-check the load by printing the first five rows (head's default).
print(df.head())
(b) Compare the distribution of imputed values with the original Age column. Do they look
realistic?
# Impute missing ages with k-nearest neighbours.
# KNNImputer measures distance on the columns that are observed for a row, so
# a frame holding only 'Age' leaves rows with a missing age nothing to compare
# on and every gap is filled with the column mean. Pclass and Fare are added
# as neighbour features so the result is a genuine KNN estimate.
# NOTE(review): assumes the CSV has numeric 'Pclass' and 'Fare' columns —
# confirm against the dataset.
age_knn_columns = ['Age', 'Pclass', 'Fare']
# Standardise first so a wide-ranging column cannot dominate the distance;
# StandardScaler ignores NaN when fitting and keeps it when transforming.
age_knn_scaler = StandardScaler()
age_knn_scaled = age_knn_scaler.fit_transform(df[age_knn_columns])
imputer = KNNImputer(n_neighbors=5)
age_knn_filled = age_knn_scaler.inverse_transform(
    imputer.fit_transform(age_knn_scaled)
)
# Column 0 is Age back on its original scale. Only the gaps are taken from
# the imputer, so observed ages stay exactly as recorded (no scaling
# round-off) and Age_KNN is a plain 1-D column.
df['Age_KNN'] = df['Age'].fillna(
    pd.Series(age_knn_filled[:, 0], index=df.index)
)
• 0-12: Child
• 13-19: Teen
• 20-35: Young Adult
• 36-60: Middle Aged
• Over 60: Senior
(b) Calculate the survival rate for each Age Group and analyze whether age impacts survival
probability.
# Derive the Age_Group label for each passenger by passing the imputed age
# in Age_KNN through age_category (defined outside this excerpt; presumably
# it implements the age bands listed for this question — verify).
df['Age_Group'] = df['Age_KNN'].apply(age_category)
(b) Create a new interaction feature between Pclass and Fare by multiplying them together.
Does this new feature have a stronger correlation with survival?
(b) Compare the number of outliers detected by both methods. Which method do you think is
more robust?
# Z-score method: flag a fare when its absolute standard score exceeds 3.
z_scores = np.abs(stats.zscore(df['Fare']))
is_extreme_fare = z_scores > 3
outliers_zscore = df.loc[is_extreme_fare]

# Report how many rows each method flagged, one line per method.
# outliers_iqr comes from the IQR step earlier in the notebook.
print("\nNumber of outliers:")
for label, flagged in (
    ("IQR method:", outliers_iqr),
    ("Z-score method:", outliers_zscore),
):
    print(label, len(flagged))
Number of outliers:
IQR method: 116
Z-score method: 20
(b) Plot histograms of the original and transformed Fare. Did the transformation make the data
more normal?
(b) Identify the top three features most correlated with Survival and justify their importance.
# Run the low_variance_features helper (defined outside this excerpt) on the
# full frame and print what it returns.
# NOTE(review): the helper's name suggests low-variance columns while the
# label below says "sufficient variance" — confirm which set is returned.
variance_features = low_variance_features(df)
print("\nFeatures with sufficient variance:", variance_features)
# (b) Interpretation
# Compare the test's p-value with the 5% significance level and print the
# matching conclusion about survival across passenger classes.
# p_value is set earlier in the notebook (presumably by the Kruskal-Wallis
# test whose results are printed with this cell — verify).
print("\nInterpretation:")
if p_value < 0.05:
    print("There is a significant difference in survival rates across passenger classes")
else:
    print("No significant difference in survival rates across passenger classes")
Kruskal-Wallis Test Results:
H-statistic: 102.77351289976991
p-value: 4.819647000539969e-23
Interpretation:
There is a significant difference in survival rates across passenger
classes
(b) Report the model coefficients and interpret whether higher fares increase survival chances.
# Build a default logistic-regression classifier and train it on X and y
# (both prepared earlier in the notebook). fit() returns the estimator
# itself, so `model` ends up bound to the fitted classifier.
model = LogisticRegression().fit(X, y)
Interpretation:
Higher fares are associated with increased survival chances
# (b) Interpretation
# Compare the test's p-value with the 5% significance level and print
# whether survival depends on the embarkation point.
# p_value is set earlier in the notebook (presumably by a chi-square test of
# independence between survival and embarkation point — verify).
print("\nInterpretation:")
if p_value < 0.05:
    print("Survival is dependent on embarkation point")
else:
    print("Survival is independent of embarkation point")
Interpretation:
Survival is dependent on embarkation point
(b) Perform K-Means clustering (k=2) on the PCA-transformed data and analyze whether
clusters align with survival.