0% found this document useful (0 votes)
5 views · 4 pages

Automatic Feature Selection

Uploaded by

bvinnuroiroi467
Copyright
© All Rights Reserved
We take content rights seriously. If you suspect this is your content, claim it here.
Available Formats
Download as PDF, TXT or read online on Scribd
0% found this document useful (0 votes)
5 views · 4 pages

Automatic Feature Selection

Uploaded by

bvinnuroiroi467
Copyright
© All Rights Reserved
We take content rights seriously. If you suspect this is your content, claim it here.
Available Formats
Download as PDF, TXT or read online on Scribd
You are on page 1 / 4

Automatic Feature Selection

1. Univariate statistics:
import matplotlib.pyplot as plt
import numpy as np
from sklearn.datasets import load_breast_cancer
from sklearn.feature_selection import SelectPercentile
from sklearn.model_selection import train_test_split

# Univariate feature selection demo: pad the breast cancer data with
# synthetic noise columns, keep the top 50% of features with
# SelectPercentile, and display which columns were kept.

cancer = load_breast_cancer()

# Seeded generator so the noise columns are identical on every run.
rng = np.random.RandomState(42)
n_samples = len(cancer.data)
noise = rng.normal(size=(n_samples, 50))

# Column layout: the original features first, then the 50 noise features.
X_w_noise = np.hstack([cancer.data, noise])

# Hold out half of the samples as a test set.
X_train, X_test, y_train, y_test = train_test_split(
    X_w_noise,
    cancer.target,
    test_size=0.5,
    random_state=0,
)

# Fit the selector on the training split only, then reduce that split to
# the selected columns.
select = SelectPercentile(percentile=50)
select.fit(X_train, y_train)
X_train_selected = select.transform(X_train)

# Compare the training-set shape before and after selection.
print("X_train.shape: {}".format(X_train.shape))
print("X_train_selected.shape: {}".format(X_train_selected.shape))

# Boolean mask over the input columns taken from the fitted selector.
mask = select.get_support()
print(mask)

# Draw the mask as a single-row image -- black is True (selected),
# white is False (not selected).
mask_row = mask.reshape(1, -1)
plt.matshow(mask_row, cmap='gray_r')
plt.xlabel("Feature index")
plt.yticks([])  # a one-row image needs no y ticks
plt.title("Feature Selection Mask")
plt.show()
2. Model-based selection:
import matplotlib.pyplot as plt
import numpy as np
from sklearn.datasets import load_breast_cancer
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectFromModel
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

# Model-based feature selection demo: a random forest ranks the breast
# cancer features, SelectFromModel keeps those at or above the median
# importance, and two classifiers are scored on the reduced feature set.

# Load the breast cancer dataset
cancer = load_breast_cancer()

# Split the dataset into training and test sets (half of the samples each)
X_train, X_test, y_train, y_test = train_test_split(
    cancer.data, cancer.target, random_state=0, test_size=0.5
)

# Selector driven by a seeded random forest; threshold="median" is the
# importance cut-off used to decide which features are kept.
select = SelectFromModel(
    RandomForestClassifier(n_estimators=100, random_state=42),
    threshold="median",
)

# Fit the selector once on the training data. The forest is seeded, so a
# second fit would only reproduce the same selection.
select.fit(X_train, y_train)

# Reduce both splits to the selected columns
X_train_selected = select.transform(X_train)
X_test_selected = select.transform(X_test)

# Fit a random forest on the selected features
model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X_train_selected, y_train)

# The *_l1 names are kept for the logistic-regression step below; they
# refer to the same selected-feature arrays as above.
X_train_l1 = X_train_selected
X_test_l1 = X_test_selected
print("X_train.shape: {}".format(X_train.shape))
print("X_train_l1.shape: {}".format(X_train_l1.shape))

# Print the accuracy on the test set with selected features
print("Score with selected features: {:.3f}".format(
    model.score(X_test_selected, y_test)))

# Visualize the mask -- black is True (selected), white is False.
# The mask has one entry per input feature, so the x axis is a feature index.
mask = select.get_support()
plt.matshow(mask.reshape(1, -1), cmap='gray_r')
plt.xlabel("Feature index")

# Start a separate figure so the bars are not drawn over the mask image.
plt.figure()

# `model` was trained on the selected columns only, so its importances are
# ordered like the selected subset; place each bar at the index of the
# corresponding column in the original data so the axis label is accurate.
importances = model.feature_importances_
selected_indices = np.flatnonzero(mask)
plt.bar(selected_indices, importances)
plt.title("Feature Importances from Random Forest")
plt.xlabel("Feature index")
plt.ylabel("Importance score")
plt.show()

# Score a logistic regression on the same selected features
score = LogisticRegression().fit(X_train_l1, y_train).score(X_test_l1, y_test)
print("Test score: {:.3f}".format(score))
3. Iterative selection:
import matplotlib.pyplot as plt
import numpy as np
from sklearn.datasets import load_breast_cancer
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import RFE
from sklearn.model_selection import train_test_split

# Iterative feature selection demo: recursive feature elimination (RFE)
# around a random forest, followed by a plot of the surviving features.

cancer = load_breast_cancer()

# Hold out half of the samples as a test set.
X_train, X_test, y_train, y_test = train_test_split(
    cancer.data,
    cancer.target,
    test_size=0.5,
    random_state=0,
)

# Seeded forest used by RFE as the estimator; n_features_to_select=10 is
# the number of features to keep.
forest = RandomForestClassifier(n_estimators=100, random_state=42)
select = RFE(forest, n_features_to_select=10)

# Run the elimination on the training split only.
select.fit(X_train, y_train)

# Boolean mask over the input features taken from the fitted selector.
mask = select.get_support()

# Draw the mask as a single-row image (the gray_r colormap renders True
# as black and False as white).
mask_row = mask.reshape(1, -1)
plt.matshow(mask_row, cmap='gray_r')
plt.xlabel("Feature index")
plt.title("Selected Features from RFE")
plt.yticks([])  # a one-row image needs no y ticks
plt.show()

You might also like