Automatically Select Imputer Parameters
Automatically Select Imputer Parameters
import numpy as np
import pandas as pd
In [34]:
# Load the training data from train.csv in the current working directory.
df = pd.read_csv(filepath_or_buffer='train.csv')
In [35]:
# Preview the first five rows of the raw frame.
df.head(n=5)
Out[35]:
   PassengerId  Survived  Pclass  Name                                               Sex     Age   SibSp  Parch  Ticket
0  1            0         3       Braund, Mr. Owen Harris                            male    22.0  1      0      A/5 21171
1  2            1         1       Cumings, Mrs. John Bradley (Florence Briggs Th...  female  38.0  1      0      PC 17599
2  3            1         3       Heikkinen, Miss. Laina                             female  26.0  0      0      STON/O2. 3101282
3  4            1         1       Futrelle, Mrs. Jacques Heath (Lily May Peel)       female  35.0  1      0      113803
4  5            0         3       Allen, Mr. William Henry                           male    35.0  0      0      373450
In [36]:
# Remove the columns that are not used as model inputs further down.
# Reassigning instead of using inplace=True, combined with errors='ignore',
# keeps this cell idempotent: re-running it after the columns are already gone
# no longer raises KeyError.
df = df.drop(columns=['PassengerId', 'Name', 'Ticket', 'Cabin'], errors='ignore')
In [37]:
# Preview the first five rows again to check the remaining columns.
df.head(n=5)
Out[37]: Survived Pclass Sex Age SibSp Parch Fare Embarked
In [38]:
# Separate the target ('Survived') from the feature columns.
y = df['Survived']
X = df.drop(columns='Survived')
In [39]:
# Hold out 20% of the rows (test_size=0.2) as a test set.
# NOTE(review): this line is cut off at the right margin in the exported
# notebook; the `random_state` value and the closing parenthesis are not
# visible here. Restore them from the original notebook before running.
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.2,random_stat
In [40]:
# Preview the first five rows of the training features.
X_train.head(n=5)
In [41]:
# Numeric branch: impute missing values with the median, then standardise.
# The step names ('imputer', 'scaler') are referenced by the search grid keys,
# so they must stay as they are.
numerical_features = ['Age', 'Fare']
numerical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
])
In [42]:
# Route each column group through its own sub-pipeline.
# NOTE(review): `categorical_transformer` and `categorical_features` are not
# defined in the visible cells. The displayed pipeline suggests
# SimpleImputer(strategy='most_frequent') + OneHotEncoder(handle_unknown='ignore')
# over ['Embarked', 'Sex']; confirm they are defined before this cell runs.
# Columns not listed in either group are dropped under ColumnTransformer's
# default remainder='drop'.
preprocessor = ColumnTransformer([
    ('num', numerical_transformer, numerical_features),
    ('cat', categorical_transformer, categorical_features),
])
In [43]:
# Full model: preprocessing followed by a logistic-regression classifier.
# The step names ('preprocessor', 'classifier') are the prefixes used by the
# search grid keys.
clf = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', LogisticRegression()),
])
In [21]:
# Switch scikit-learn's estimator display to the diagram representation.
from sklearn import set_config
set_config(display='diagram')
# Bare last expression: displays the assembled pipeline as the cell output.
clf
Out[21]: Pipeline
Pipeline(steps=[('preprocessor',
                 ColumnTransformer(transformers=[('num',
                                                  Pipeline(steps=[('imputer',
                                                                   SimpleImputer(strategy='median')),
                                                                  ('scaler',
                                                                   StandardScaler())]),
                                                  ['Age', 'Fare']),
                                                 ('cat',
                                                  Pipeline(steps=[('imputer',
                                                                   SimpleImputer(strategy='most_frequent')),
                                                                  ('ohe',
                                                                   OneHotEncoder(handle_unknown='ignore'))]),
                                                  ['Embarked', 'Sex'])])),
                ('classifier', LogisticRegression())])
preprocessor: ColumnTransformer
ColumnTransformer(transformers=[('num',
                                 Pipeline(steps=[('imputer',
                                                  SimpleImputer(strategy='median')),
                                                 ('scaler', StandardScaler())]),
                                 ['Age', 'Fare']),
                                ('cat',
                                 Pipeline(steps=[('imputer',
                                                  SimpleImputer(strategy='most_frequent')),
                                                 ('ohe',
                                                  OneHotEncoder(handle_unknown='ignore'))]),
                                 ['Embarked', 'Sex'])])
num
['Age', 'Fare']
SimpleImputer
SimpleImputer(strategy='median')
StandardScaler
StandardScaler()
cat
['Embarked', 'Sex']
SimpleImputer
SimpleImputer(strategy='most_frequent')
OneHotEncoder
OneHotEncoder(handle_unknown='ignore')
LogisticRegression
LogisticRegression()
In [44]:
# Search space for the grid search. Each key is a `step__substep__param` path
# into the `clf` pipeline: imputation strategy for the numeric and categorical
# branches, plus the classifier's C value.
param_grid = dict(
    preprocessor__num__imputer__strategy=['mean', 'median'],
    preprocessor__cat__imputer__strategy=['most_frequent', 'constant'],
    classifier__C=[0.1, 1.0, 10, 100],
)
In [45]:
# Run the search on the training split and report the winning combination.
# NOTE(review): `grid_search` is not defined in the visible cells; presumably
# a GridSearchCV built from `clf` and `param_grid`. Confirm it is created
# before this cell runs.
grid_search.fit(X_train, y_train)
print("Best params:", grid_search.best_params_, sep="\n")
Best params:
{'classifier__C': 1.0, 'preprocessor__cat__imputer__strategy': 'most_frequent', 'preprocessor__num__imputer__strategy': 'mean'}
In [26]:
# Report the search's best_score_ to three decimals.
# NOTE(review): presumably the mean cross-validated score of the best
# parameter combination, assuming `grid_search` is a GridSearchCV; its
# definition is not in the visible cells.
print(f"Internal CV score: {grid_search.best_score_:.3f}")
In [46]:
# Tabulate every parameter combination recorded by the search, ordered so the
# highest mean test score comes first. pandas is already imported as `pd` at
# the top of the notebook, so it is not re-imported here.
cv_results = (
    pd.DataFrame(grid_search.cv_results_)
    .sort_values("mean_test_score", ascending=False)
)
# Show selected parameter columns of the sorted results.
# NOTE(review): this line is cut off at the right margin in the exported
# notebook; the rest of the column list and the closing brackets are not
# visible here. Restore them from the original notebook before running.
cv_results[['param_classifier__C','param_preprocessor__cat__imputer__strategy'
Out[46]: param_classifier__C param_preprocessor__cat__imputer__strategy param_preprocess
4 1 most_frequent
5 1 most_frequent
6 1 constant
7 1 constant
8 10 most_frequent
9 10 most_frequent
10 10 constant
11 10 constant
12 100 most_frequent
13 100 most_frequent
14 100 constant
15 100 constant
0 0.1 most_frequent
1 0.1 most_frequent
2 0.1 constant
3 0.1 constant
In [ ]: