Automatic Feature Selection
Automatic Feature Selection
1. Univariate statistics:
import numpy as np
from sklearn.datasets import load_breast_cancer
from sklearn.feature_selection import SelectPercentile
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt  # needed for the mask plot below

# NOTE(review): `select` must already be a fitted feature selector here; it is
# not constructed in this snippet. Presumably a SelectPercentile (imported
# above) fitted on the training split -- confirm against the full example.
mask = select.get_support()
print(mask)

# Visualize the mask -- black is True (selected), white is False (not selected).
# reshape(1, -1) turns the 1-D mask into a single row so matshow can draw it.
plt.matshow(mask.reshape(1, -1), cmap='gray_r')
plt.xlabel("Feature index")
plt.yticks([])  # hide y-ticks: the image has a single row
plt.title("Feature Selection Mask")
plt.show()
2. Model-based selection:
import numpy as np
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectFromModel
import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegression  # used for the final test score

# NOTE(review): `select`, X_train/X_test and y_train/y_test are not defined in
# this snippet. Presumably `select` is a SelectFromModel wrapping a
# RandomForestClassifier (both imported above) -- confirm against the full example.
select.fit(X_train, y_train)
X_train_l1 = select.transform(X_train)
print("X_train.shape: {}".format(X_train.shape))
print("X_train_l1.shape: {}".format(X_train_l1.shape))

mask = select.get_support()
# Visualize the mask -- black is True (selected), white is False (not selected).
plt.matshow(mask.reshape(1, -1), cmap='gray_r')
# The support mask has one entry per input feature, so the x-axis indexes
# features, not samples.
plt.xlabel("Feature index")
plt.show()  # display the figure, as the univariate section does

# Reduce the test split with the same fitted selector, then score a classifier
# trained on the selected features only.
X_test_l1 = select.transform(X_test)
score = LogisticRegression().fit(X_train_l1, y_train).score(X_test_l1, y_test)
print("Test score: {:.3f}".format(score))
3. Iterative selection:
import numpy as np
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import RFE
import matplotlib.pyplot as plt