B58 - Handling Missing Values & Feature Selection
import pandas as pd
import numpy as np
from sklearn.impute import SimpleImputer

# Load the Pima Indians Diabetes dataset and check for missing values.
df = pd.read_csv('diabetes.csv')

# isna() is the modern alias of isnull(); per-column count of missing entries.
print(df.isna().sum())
Pregnancies 0
Glucose 0
BloodPressure 0
SkinThickness 0
Insulin 0
BMI 0
DiabetesPedigreeFunction 0
Age 0
Outcome 0
dtype: int64
import pandas as pd
import numpy as np
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

# Load the (already one-hot encoded) placement dataset.
df = pd.read_csv('Placement_Dataset.csv')

# Target is the dummy column 'status_Placed'; everything else is a feature.
# NOTE(review): 'salary' is likely only known AFTER placement — probable
# target leakage; confirm and consider dropping it from X.
X = df.drop('status_Placed', axis=1)
y = df['status_Placed']

# BUG FIX: X_train/y_train were used without ever being created — the
# imported train_test_split was never called, so model.fit raised NameError
# on a fresh kernel. Create the split before fitting.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Baseline model on all features; fixed seed for reproducibility.
model = RandomForestClassifier(random_state=42)
model.fit(X_train, y_train)
RandomForestClassifier(random_state=42)
# Impurity-based importances from the fitted forest, labelled by feature
# name and shown largest-first.
feature_importances = pd.Series(
    model.feature_importances_, index=X.columns
)
ranked = feature_importances.sort_values(ascending=False)
print("Feature Importances:")
print(ranked)
Feature Importances:
salary 0.323788
ssc_p 0.234238
degree_p 0.133835
hsc_p 0.119721
mba_p 0.052254
etest_p 0.036298
sl_no 0.033773
workex_Yes 0.019531
specialisation_Mkt&HR 0.012677
gender_M 0.010178
ssc_b_Others 0.006278
degree_t_Sci&Tech 0.004213
hsc_b_Others 0.003628
degree_t_Others 0.003604
hsc_s_Science 0.003567
hsc_s_Commerce 0.002416
dtype: float64
# Keep only the five most important features and retrain on that subset.
top_features = feature_importances.sort_values(ascending=False).head(5).index
X_train_selected = X_train.loc[:, top_features]
X_test_selected = X_test.loc[:, top_features]

# Same estimator configuration as the baseline, for a fair comparison.
model_selected = RandomForestClassifier(random_state=42)
model_selected.fit(X_train_selected, y_train)
RandomForestClassifier(random_state=42)
# Evaluate the reduced-feature model on the held-out test set.
y_pred_selected = model_selected.predict(X_test_selected)
accuracy_selected = accuracy_score(y_test, y_pred_selected)
# FIX: the accuracy was computed but never shown — the cell had no output.
print(f"Accuracy with selected features: {accuracy_selected:.4f}")