Data pipeline in ML
Data pipeline in ML
# import packages
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
import warnings
# Silence warnings process-wide: the 'ignore' action with no category or
# module filter matches every warning raised after this call.
# NOTE(review): this also hides library warnings (e.g. deprecation or
# convergence messages) that can be useful while debugging.
warnings.filterwarnings('ignore')
# Load the diabetes CSV into a DataFrame. header=None tells pandas that the
# file has no header row, so the columns get integer labels (0, 1, 2, ...).
df = pd.read_csv(
    '/content/pima-indians-diabetes.csv',
    header=None,
)
# Preview the first rows (only rendered when run as a notebook cell).
df.head()
0 1 2 3 4 5 6 7 8
1 1 85 66 29 0 26.6 0.351 31 0
3 1 89 66 23 94 28.1 0.167 21 0
# Replace the integer column labels with descriptive names, in file order.
# NOTE(review): 'SkinThicness' is misspelled; it is kept as-is because other
# code may look the column up by this exact label.
df.columns = [
    'Pregnancies',
    'Glucose',
    'BloodPressure',
    'SkinThicness',
    'Insulin',
    'BMI',
    'DiabetesPedigreeFunction',
    'Age',
    'Class',
]
# Preview the first rows again, now with the named columns.
df.head()
0 1 85 66 29 0 26.6 0.351 31 0
2 1 89 66 23 94 28.1 0.167 21 0
# Feature matrix: every row, column positions 0-7, i.e. 'Pregnancies',
# 'Glucose', 'BloodPressure', 'SkinThicness', 'Insulin', 'BMI',
# 'DiabetesPedigreeFunction' and 'Age'.
X = df.iloc[:, list(range(8))]

# Target: every row, column position 8 ('Class'). Selecting with a list keeps
# y as a single-column DataFrame rather than a Series.
# NOTE(review): scikit-learn estimators expect a 1-D target; consider
# df.iloc[:, 8] if nothing downstream relies on y being a DataFrame.
y = df.iloc[:, [8]]
Creating Pipeline
Creating pipelines for the LogisticRegression, DecisionTree and RandomForest models
The pipeline steps will include:
1. Data preprocessing using MinMaxScaler
2. Reducing dimensionality using PCA
3. Training the models
# Logistic-regression pipeline: min-max scale the features, project them onto
# 3 principal components, then fit a LogisticRegression classifier.
LogisticRegressionPipeline = Pipeline(
    steps=[
        ('myscaler', MinMaxScaler()),
        ('mypca', PCA(n_components=3)),
        ('logist_regression', LogisticRegression()),
    ]
)
# Decision-tree pipeline: min-max scale the features, project them onto
# 3 principal components, then fit a DecisionTreeClassifier.
# NOTE(review): the final step is labelled 'logist_regression' although it
# holds a decision tree. The label is kept unchanged because step names are
# addressable (named_steps / '<step>__<param>'), so renaming could break users.
DecisionTreePipeline = Pipeline(
    steps=[
        ('myscaler', MinMaxScaler()),
        ('mypca', PCA(n_components=3)),
        ('logist_regression', DecisionTreeClassifier()),
    ]
)
# Random-forest pipeline: min-max scale the features, project them onto
# 3 principal components, then fit a RandomForestClassifier.
# NOTE(review): the final step is labelled 'logist_regression' although it
# holds a random forest. The label is kept unchanged because step names are
# addressable (named_steps / '<step>__<param>'), so renaming could break users.
RandomForestPipeline = Pipeline(
    steps=[
        ('myscaler', MinMaxScaler()),
        ('mypca', PCA(n_components=3)),
        ('logist_regression', RandomForestClassifier()),
    ]
)
# Initial module-level state: a zero accuracy and an empty-string placeholder
# for the pipeline. Presumably both are overwritten later when the fitted
# pipelines are compared - TODO confirm against the code that follows.
accuracy, pipeline = 0, ''
# classifier = 0  (left disabled, as in the original)