vertopal.com_model_training
vertopal.com_model_training
import numpy as np
import pandas as pd
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OrdinalEncoder
# Load the gemstone dataset. The raw-string path (Windows-style separators)
# is kept exactly as written.
df = pd.read_csv(r"..\\notebooks\\data\\gemstone.csv")

# Target: the 'price' column, kept as a single-column DataFrame.
y = df[['price']]
# Features: every column except the target.
X = df.drop(columns=['price'])

# Notebook cell: preview the first rows of the feature frame.
X.head()
price
0 13619
1 13387
2 2772
3 666
4 14453
... ...
193568 1130
193569 2874
193570 3036
193571 681
193572 2258
# Split the feature columns by dtype: object columns are treated as
# categorical, everything else as numerical.
# NOTE(review): cat_features is consumed by the ColumnTransformer further down
# but was never assigned in this file. The include="object" selection is
# assumed to yield cut, color and clarity (in that order, matching the order
# of the OrdinalEncoder category lists) -- confirm against the data.
cat_features = X.select_dtypes(include="object").columns
num_features = X.select_dtypes(exclude="object").columns
print(num_features)
## Pipelines
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
# Numerical Pipeline
num_pipeline=Pipeline(
steps=[
# Categorical Pipeline
# Two steps applied in order to the categorical columns:
#   1. impute missing values with the most frequent category;
#   2. convert each category to a number using the explicitly ordered
#      category lists (one list per column: cut, color, clarity).
cat_pipeline = Pipeline(
    steps=[
        # handling missing values
        ('imputer', SimpleImputer(strategy='most_frequent')),
        # handling categorical to numerical conversion
        (
            'ordinalencoder',
            OrdinalEncoder(
                categories=[cut_categories, color_categories, clarity_categories]
            ),
        ),
    ]
)
# Combine both pipelines: each entry is (name, pipeline, columns it applies to).
# The entry names show up as prefixes in the transformed feature names
# (e.g. 'num_pipeline__carat', 'cat_pipeline__cut').
preprocessor = ColumnTransformer(
    transformers=[
        ('num_pipeline', num_pipeline, num_features),
        ('cat_pipeline', cat_pipeline, cat_features),
    ]
)
# Hold out 30% of the rows for testing; the fixed random_state makes the
# split repeatable across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.30, random_state=30
)

# Notebook cells: fit the preprocessor on the training features, then apply
# the fitted transformation again. Neither result is stored here.
preprocessor.fit_transform(X_train)
preprocessor.transform(X_train)
array(['num_pipeline__carat', 'num_pipeline__depth',
'num_pipeline__table', 'num_pipeline__x', 'num_pipeline__y',
'num_pipeline__z', 'cat_pipeline__cut', 'cat_pipeline__color',
'cat_pipeline__clarity'], dtype=object)
# Replace the raw feature frames with their preprocessed versions, labelling
# the columns with the names generated by the preprocessor.
# The preprocessor is fitted on the training data only; the test data is
# transformed with that fitted state.
# NOTE(review): the rebuilt frames get a fresh 0..n-1 index, while
# y_train / y_test are not rebuilt here -- keep that in mind before any
# index-based alignment of features and targets.
X_train = pd.DataFrame(
    preprocessor.fit_transform(X_train),
    columns=preprocessor.get_feature_names_out(),
)
X_test = pd.DataFrame(
    preprocessor.transform(X_test),
    columns=preprocessor.get_feature_names_out(),
)

# Notebook cell: display the transformed training features.
X_train
cat_pipeline__color cat_pipeline__clarity
0 5.0 5.0
1 1.0 2.0
2 3.0 4.0
3 3.0 3.0
4 6.0 5.0
... ... ...
135496 1.0 2.0
135497 3.0 1.0
135498 3.0 2.0
135499 3.0 4.0
135500 3.0 2.0
# Notebook cell: display the transformed test features.
X_test
num_pipeline__carat num_pipeline__depth
num_pipeline__table \
0 -0.564688 -0.942132 -0.642862
cat_pipeline__color cat_pipeline__clarity
0 1.0 3.0
1 4.0 2.0
2 4.0 7.0
3 3.0 3.0
4 1.0 4.0
... ... ...
58067 4.0 3.0
58068 2.0 6.0
58069 6.0 3.0
58070 3.0 2.0
58071 6.0 3.0
So far we have only learned Linear Regression and Logistic Regression, and we are using them to
create an end-to-end project.
After that, will you be covering the remaining ML algorithms and building end-to-end projects with them?