Master Class Python 02
%matplotlib inline
In [19]: import os
os.chdir("C:/Users/SOCIAL DATA/Downloads")
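The cell that loads the data into the variable data is not shown in this copy. A minimal sketch of the missing step, assuming the diabetes CSV sits in the working directory under the hypothetical name diabetes.csv:

import pandas as pd

data = pd.read_csv("diabetes.csv")  # file name assumed; adjust to the actual download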
In [22]: data.head()
Out[22]:
   Pregnancies  Glucose  BloodPressure  SkinThickness  Insulin   BMI  DiabetesPedigreeFunction  Age  Outcome
1            1       85             66             29        0  26.6                     0.351   31        0
3            1       89             66             23       94  28.1                     0.167   21        0
In [23]: data.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 768 entries, 0 to 767
Data columns (total 9 columns):
 #   Column                    Non-Null Count  Dtype
---  ------                    --------------  -----
 0   Pregnancies               768 non-null    int64
 1   Glucose                   768 non-null    int64
 2   BloodPressure             768 non-null    int64
 3   SkinThickness             768 non-null    int64
 4   Insulin                   768 non-null    int64
 5   BMI                       768 non-null    float64
 6   DiabetesPedigreeFunction  768 non-null    float64
 7   Age                       768 non-null    int64
 8   Outcome                   768 non-null    int64
dtypes: float64(2), int64(7)
memory usage: 54.1 KB
We define which columns are categorical and which are numeric, and build a transformer for each group:

from sklearn.pipeline import Pipeline as skPipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder

numeric_features = ['Glucose','BloodPressure','SkinThickness','Insulin','BMI']
categorical_features = ['Pregnancies']

# Numeric columns: impute missing values with the mean, then standardize
numeric_transformer = skPipeline(steps=[
    ('simple_imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler())])

# Categorical columns: impute with the most frequent value, then one-hot encode
categorical_transformer = skPipeline(steps=[
    ('simple_imputer', SimpleImputer(strategy='most_frequent')),
    ('one_hot', OneHotEncoder(handle_unknown='ignore'))])
ColumnTransformer
Lets different columns, or subsets of columns, of the input be transformed separately; the features generated by each transformer are then concatenated to form a single feature space.
from sklearn.compose import ColumnTransformer

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)])
preprocessor
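As a quick check of that concatenation, a minimal sketch (not in the original notebook) that fits and applies only the preprocessing step, assuming data is loaded as above:

Xt = preprocessor.fit_transform(data)
print(Xt.shape)  # 768 rows x (5 scaled numeric columns + one one-hot column per distinct 'Pregnancies' value)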
The full pipeline will:
1. Standardize the features
2. Oversample with SMOTE
3. Perform PCA
4. Make the prediction
As you can see, the order matters: it tells the Pipeline which steps to run and in what sequence.
from imblearn.pipeline import Pipeline as imbPipeline
from imblearn.over_sampling import SMOTE
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestClassifier

smt = SMOTE(random_state=42)
pca = PCA(n_components=4)

clf = skPipeline(steps=[
    ("preprocessor", preprocessor),
    ("pipeline", imbPipeline([
        ("smt", smt),
        ("pca", pca),
        ("classifier", RandomForestClassifier(n_estimators=100))
    ]))
])
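Note that the inner pipeline is imbalanced-learn's Pipeline, not scikit-learn's: resampling steps such as SMOTE are applied only during fit (to balance the training data) and are skipped at predict time. A minimal sketch of that behaviour in isolation (the *_demo names are illustrative, not from the notebook):

import numpy as np
from imblearn.pipeline import Pipeline as imbPipeline
from imblearn.over_sampling import SMOTE
from sklearn.ensemble import RandomForestClassifier

X_demo = np.random.RandomState(0).rand(100, 3)
y_demo = np.array([0] * 90 + [1] * 10)   # imbalanced labels

demo = imbPipeline([("smt", SMOTE(random_state=42)),
                    ("clf", RandomForestClassifier(n_estimators=10))])
demo.fit(X_demo, y_demo)     # SMOTE balances the classes here only
demo.predict(X_demo[:5])     # no resampling happens at predict time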
In [74]: clf
Out[74]:
Pipeline(memory=None,
         steps=[('preprocessor',
                 ColumnTransformer(n_jobs=None, remainder='drop',
                                   sparse_threshold=0.3,
                                   transformer_weights=None,
                                   transformers=[('num',
                                                  Pipeline(memory=None,
                                                           steps=[('simple_imputer',
                                                                   SimpleImputer(add_indicator=False,
                                                                                 copy=True,
                                                                                 fill_value=None,
                                                                                 missing_values=nan,
                                                                                 strategy='mean',
                                                                                 verbose=0)),
                                                                  ('scaler',
                                                                   StandardScaler(copy=True,
                                                                                  with...
                 RandomForestClassifier(bootstrap=True, class_weight=None,
                                        criterion='gini', max_depth=None,
                                        max_features='auto', max_leaf_nodes=None,
                                        min_impurity_decrease=0.0,
                                        min_impurity_split=None,
                                        min_samples_leaf=1, min_samples_split=2,
                                        min_weight_fraction_leaf=0.0,
                                        n_estimators=100, n_jobs=None,
                                        oob_score=False, random_state=None,
                                        verbose=0, warm_start=False))],
         verbose=False)
Let's split the data into training and test sets.
In [75]: X = data.iloc[:,0:8].values
In [76]: y = data.iloc[:,8].values

X = pd.DataFrame(X, columns=['Pregnancies','Glucose','BloodPressure','SkinThickness',
                             'Insulin','BMI','DiabetesPedigreeFunction','Age'])

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)
In [80]: clf.fit(X_train,y_train)
C:\Users\SOCIAL DATA\AppData\Roaming\Python\Python37\site-packages\sklearn\utils\validation.py:724: DataConversionWarning: A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel().
  y = column_or_1d(y, warn=True)
Out[80]:
Pipeline(memory=None,
         steps=[('preprocessor', ColumnTransformer(...)),
                ('pipeline',
                 Pipeline(steps=[('smt', SMOTE(random_state=42)),
                                 ('pca', PCA(n_components=4)),
                                 ('classifier',
                                  RandomForestClassifier(n_estimators=100))]))],
         verbose=False)
Once fitted, the pipeline internals can be inspected through named_steps:

In [14]: clf.named_steps.pipeline.named_steps.pca.explained_variance_  # variance explained by each of the 4 components
In [15]: clf.named_steps.preprocessor.named_transformers_  # fitted transformers, keyed by name ('num', 'cat')
In [16]: clf.named_steps.preprocessor.named_transformers_.cat.named_steps.one_hot.categories_  # categories learned by the one-hot encoder
from sklearn.metrics import classification_report

# Performance on the test set
y_pred = clf.predict(X_test)
print(classification_report(y_test, y_pred))

# Performance on the training set
y_pred = clf.predict(X_train)
print(classification_report(y_train, y_pred))
If we want to tune hyperparameters, we can first list the parameter names exposed by the Pipeline:
In [19]: clf.get_params().keys()
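The keys follow scikit-learn's double-underscore convention, step__substep__parameter, so nested parameters can be addressed directly. For example, with the names defined above:

# 'pipeline__pca__n_components'     -> n_components of the PCA inside the inner pipeline
# 'pipeline__classifier__max_depth' -> max_depth of the RandomForestClassifier
clf.set_params(pipeline__pca__n_components=3)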
("pipeline", imbPipeline([
("smt", smt),
("pca",pca),
("classifier", RandomForestClassifier(n_estimators=100))
]))
])
param_grid = {
'pipeline__pca__n_components': [1,2,3,4],
'pipeline__classifier__max_depth':[2,3,4],
search.
(X_train, y_train)
print(search.best_params_)
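By default GridSearchCV refits the best parameter combination on the whole training set, so the tuned model can be evaluated directly; a short sketch (this evaluation is assumed, not from the original):

best_model = search.best_estimator_   # pipeline refitted with the best parameters
print(search.best_score_)             # mean cross-validated score of those parameters
print(classification_report(y_test, best_model.predict(X_test)))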