0% found this document useful (0 votes)
2 views

vertopal.com_model_training

Vertopal tutorial PDF to help check the handout

Uploaded by

drsaheb422
Copyright
© © All Rights Reserved
We take content rights seriously. If you suspect this is your content, claim it here.
Available Formats
Download as PDF, TXT or read online on Scribd
0% found this document useful (0 votes)
2 views

vertopal.com_model_training

Vertopal tutorial PDF to help check the handout

Uploaded by

drsaheb422
Copyright
© © All Rights Reserved
We take content rights seriously. If you suspect this is your content, claim it here.
Available Formats
Download as PDF, TXT or read online on Scribd
You are on page 1/ 6

import pandas as pd

import numpy as np

# Load the gemstone dataset. NOTE: inside a raw string, "\\" is two literal
# backslashes, so the original path contained doubled separators; a raw
# string needs only single backslashes.
df = pd.read_csv(r"..\notebooks\data\gemstone.csv")

# 'id' is a row identifier with no predictive value — drop it in place.
df.drop(labels=['id'], axis=1, inplace=True)


df.head()

carat cut color clarity depth table x y z


price
0 1.52 Premium F VS2 62.2 58.0 7.27 7.33 4.55
13619
1 2.03 Very Good J SI2 62.0 58.0 8.06 8.12 5.05
13387
2 0.70 Ideal G VS1 61.2 57.0 5.69 5.73 3.50
2772
3 0.32 Ideal G VS1 61.6 56.0 4.38 4.41 2.71
666
4 1.70 Premium G VS2 62.6 59.0 7.65 7.61 4.77
14453

# Feature matrix: every column except the target.
X = df.drop(columns=['price'])
# Target kept as a one-column DataFrame (double-bracket selection).
y = df[['price']]

X.head()

carat cut color clarity depth table x y z


0 1.52 Premium F VS2 62.2 58.0 7.27 7.33 4.55
1 2.03 Very Good J SI2 62.0 58.0 8.06 8.12 5.05
2 0.70 Ideal G VS1 61.2 57.0 5.69 5.73 3.50
3 0.32 Ideal G VS1 61.6 56.0 4.38 4.41 2.71
4 1.70 Premium G VS2 62.6 59.0 7.65 7.61 4.77

price
0 13619
1 13387
2 2772
3 666
4 14453
... ...
193568 1130
193569 2874
193570 3036
193571 681
193572 2258

[193573 rows x 1 columns]


# Columns holding categorical (object-dtype) values.
cat_features = X.select_dtypes(include=["object"]).columns
print(cat_features)

Index(['cut', 'color', 'clarity'], dtype='object')

# Columns holding numeric values (everything that is not object-dtype).
num_features = X.select_dtypes(exclude=["object"]).columns
print(num_features)

Index(['carat', 'depth', 'table', 'x', 'y', 'z'], dtype='object')

# Explicit ordinal rankings (worst -> best) for each categorical feature,
# consumed by the OrdinalEncoder so category order maps to integer rank.
cut_categories = ['Fair', 'Good', 'Very Good', 'Premium', 'Ideal']
color_categories = ['D', 'E', 'F', 'G', 'H', 'I', 'J']
clarity_categories = ['I1', 'SI2', 'SI1', 'VS2', 'VS1', 'VVS2', 'VVS1', 'IF']

# PDF extraction had split the "feature scaling" comment onto its own line,
# leaving a bare `scaling` token that raises NameError — rejoined here.
from sklearn.impute import SimpleImputer          # handling missing values
from sklearn.preprocessing import StandardScaler  # handling feature scaling
from sklearn.preprocessing import OrdinalEncoder  # ordinal encoding

## Pipelines
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

# Numerical pipeline: impute missing values (mean by default), then
# standardize to zero mean / unit variance.
# NOTE: the extracted source was missing the closing ")" of this Pipeline
# call and had identifiers wrapped mid-word — reconstructed here.
num_pipeline = Pipeline(
    steps=[
        ('imputer', SimpleImputer()),    # handling missing values
        ('scaler', StandardScaler()),    # handling scaling of values
    ]
)

# Categorical pipeline: impute with the most frequent category, then map
# each category to its ordinal rank defined by the *_categories lists above.
cat_pipeline = Pipeline(
    steps=[
        ('imputer', SimpleImputer(strategy='most_frequent')),
        ('ordinalencoder', OrdinalEncoder(
            categories=[cut_categories, color_categories, clarity_categories]
        )),
    ]
)

# Route each column subset through its matching pipeline.
preprocessor = ColumnTransformer(
    [
        ('num_pipeline', num_pipeline, num_features),
        ('cat_pipeline', cat_pipeline, cat_features),
    ]
)

## Train test split

from sklearn.model_selection import train_test_split

# 70/30 split; fixed random_state makes the split reproducible.
# (The extracted source had "random_state" wrapped mid-word — rejoined.)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.30, random_state=30
)

# Learn imputation/scaling/encoding statistics from the training data only.
preprocessor.fit_transform(X_train)

array([[-0.97543926, -0.84960654, -0.12153081, ..., 4. ,


5. , 5. ],
[ 0.2351953 , 1.83363716, -0.12153081, ..., 1. ,
1. , 2. ],
[ 0.49461699, 0.81585507, 0.39980029, ..., 3. ,
3. , 4. ],
...,
[ 0.45138004, 1.55606023, -0.6428619 , ..., 1. ,
3. , 2. ],
[ 0.66756478, -1.77486298, 1.44246248, ..., 4. ,
3. , 4. ],
[ 0.25681377, 0.81585507, -0.12153081, ..., 4. ,
3. , 2. ]])

# transform() reuses the statistics already learned by fit_transform, so the
# output array matches the previous cell.
preprocessor.transform(X_train)

array([[-0.97543926, -0.84960654, -0.12153081, ..., 4. ,


5. , 5. ],
[ 0.2351953 , 1.83363716, -0.12153081, ..., 1. ,
1. , 2. ],
[ 0.49461699, 0.81585507, 0.39980029, ..., 3. ,
3. , 4. ],
...,
[ 0.45138004, 1.55606023, -0.6428619 , ..., 1. ,
3. , 2. ],
[ 0.66756478, -1.77486298, 1.44246248, ..., 4. ,
3. , 4. ],
[ 0.25681377, 0.81585507, -0.12153081, ..., 4. ,
3. , 2. ]])
# Names of the transformed output columns, in "<pipeline>__<column>" form.
preprocessor.get_feature_names_out()

array(['num_pipeline__carat', 'num_pipeline__depth',
'num_pipeline__table', 'num_pipeline__x', 'num_pipeline__y',
'num_pipeline__z', 'cat_pipeline__cut', 'cat_pipeline__color',
'cat_pipeline__clarity'], dtype=object)

# Rebuild labelled DataFrames from the transformed arrays.
# Fit on the training set only, then reuse that fit for the test set so no
# test-set statistics leak into preprocessing.
# (The extracted source had "preprocessor" wrapped mid-word — rejoined.)
X_train = pd.DataFrame(
    preprocessor.fit_transform(X_train),
    columns=preprocessor.get_feature_names_out(),
)
X_test = pd.DataFrame(
    preprocessor.transform(X_test),
    columns=preprocessor.get_feature_names_out(),
)

X_train

num_pipeline__carat num_pipeline__depth num_pipeline__table


\
0 -0.975439 -0.849607 -0.121531

1 0.235195 1.833637 -0.121531

2 0.494617 0.815855 0.399800

3 -1.018676 0.260701 0.921131

4 -0.953821 -0.664555 -0.642862

... ... ... ...

135496 -1.040295 -0.016876 -0.642862

135497 0.991842 0.168176 -0.642862

135498 0.451380 1.556060 -0.642862

135499 0.667565 -1.774863 1.442462

135500 0.256814 0.815855 -0.121531

num_pipeline__x num_pipeline__y num_pipeline__z


cat_pipeline__cut \
0 -1.042757 -1.080970 -1.123150
4.0
1 0.318447 0.279859 0.485354
1.0
2 0.570855 0.606458 0.673737
3.0
3 -1.214034 -1.244270 -1.195605
3.0
4 -1.069801 -1.044681 -1.094168
4.0
... ... ... ...
...
135496 -1.268122 -1.244270 -1.239078
4.0
135497 1.048629 1.114501 1.079486
4.0
135498 0.516768 0.588314 0.702719
1.0
135499 0.868337 0.951202 0.688228
4.0
135500 0.381549 0.415942 0.470863
4.0

cat_pipeline__color cat_pipeline__clarity
0 5.0 5.0
1 1.0 2.0
2 3.0 4.0
3 3.0 3.0
4 6.0 5.0
... ... ...
135496 1.0 2.0
135497 3.0 1.0
135498 3.0 2.0
135499 3.0 4.0
135500 3.0 2.0

[135501 rows x 9 columns]

# Display the transformed test set.
X_test

num_pipeline__carat num_pipeline__depth
num_pipeline__table \
0 -0.564688 -0.942132 -0.642862

1 -0.175556 1.000906 -0.121531

2 -1.061913 0.260701 -0.121531

3 0.970223 -0.201927 1.963794

4 -0.932202 -1.312235 0.399800

... ... ... ...

58067 1.013460 1.185958 -0.642862

58068 -0.997058 0.260701 -1.164193

58069 -0.197174 -3.347799 1.442462

58070 -0.824110 -0.201927 -0.121531


58071 2.613227 -0.757081 1.442462

num_pipeline__x num_pipeline__y num_pipeline__z


cat_pipeline__cut \
0 -0.429765 -0.464061 -0.500036
3.0
1 -0.042137 -0.028595 0.036132
2.0
2 -1.304180 -1.298703 -1.268060
4.0
3 1.048629 0.996563 0.978049
3.0
4 -1.006699 -0.990248 -1.065186
3.0
... ... ... ...
...
58067 1.003556 1.041924 1.151941
2.0
58068 -1.141917 -1.126331 -1.108659
4.0
58069 0.102096 0.071199 -0.224706
3.0
58070 -0.853450 -0.881382 -0.876803
4.0
58071 2.139394 2.039865 2.006912
3.0

cat_pipeline__color cat_pipeline__clarity
0 1.0 3.0
1 4.0 2.0
2 4.0 7.0
3 3.0 3.0
4 1.0 4.0
... ... ...
58067 4.0 3.0
58068 2.0 6.0
58069 6.0 3.0
58070 3.0 2.0
58071 6.0 3.0

[58072 rows x 9 columns]

So far we have only learned Linear Regression and Logistic Regression, and using them we are
trying to create an end-to-end project.

But after that, will you be covering the remaining ML algorithms and building an end-to-end project with them?

You might also like