Supervised Learning
Example Log Transformation
Import Libraries
import seaborn as sns
import numpy as np
import pandas as pd
Load Data Set and make a copy
tips = sns.load_dataset('tips')
tips1 = tips.copy()   # .copy() so the log transform below does not modify the original dataset
tips1
Create Box plot to check outliers
sns.boxplot(data=tips1, x='day', y='total_bill')
Create dist plot
sns.histplot(tips1['total_bill'], kde=True)   # distplot is deprecated in recent seaborn versions
Apply log Transformation to address outliers
tips1['total_bill'] = np.log10(tips1['total_bill'])
Create box plot and check outlier again
sns.boxplot(data=tips1, x='day', y='total_bill')
Create dist plot
sns.histplot(tips1['total_bill'], kde=True)   # distplot is deprecated in recent seaborn versions
Save the result as .xlsx
tips1.to_excel('C:\\Noble\\Training\\DS Temporary Files\\tips.xlsx')
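Because the transformation is base-10, the original bill amounts can be recovered with the inverse 10**x. A minimal sketch, assuming tips1['total_bill'] currently holds the log-transformed values and using a hypothetical column name total_bill_original:
tips1['total_bill_original'] = np.power(10, tips1['total_bill'])   # undo the log10 transform
tips1[['total_bill', 'total_bill_original']].head()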
Simple Linear Regression
Import the Libraries
import numpy as np
import pandas as pd
import os
import matplotlib.pyplot as plt
Load the Data Set
os.chdir('C:\\Noble\\Training\\Top Mentor\\Training\\Data Set\\')
os.getcwd()
df1= pd.read_csv('Salary_Data.csv')
print (df1)
Create the graph to check the trend
plt.plot(df1["YearsExperience"], df1["Salary"])
plt.show()
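A scatter plot can make the trend easier to judge than a connected line, since the rows are not sorted by experience. A minimal sketch, reusing df1 from above:
plt.scatter(df1["YearsExperience"], df1["Salary"])   # one point per record
plt.xlabel("YearsExperience")
plt.ylabel("Salary")
plt.show()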
Split the data into x and y (independent and dependent variables)
x = df1.iloc[:,:-1].values
print (x)
y = df1.iloc[:,1].values
print (y)
Split the Data: Train-Test Split
from sklearn.model_selection import train_test_split
x_train, x_test,y_train,y_test = train_test_split(x,y,test_size=0.2)
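The split is random, so the predictions and the R² score below change on every run. A minimal sketch of a reproducible variant with a fixed random_state (the later sections use random_state=42 in the same way):
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)   # fixed seed gives the same split every run
print(x_train.shape, x_test.shape)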
Model fitting
from sklearn.linear_model import LinearRegression
reg = LinearRegression()
reg.fit(x_train, y_train)
Prediction
y_pred= reg.predict(x_test)
print (y_pred)
y = mx + c (coefficient and intercept values): m is the slope given by reg.coef_ and c is the intercept given by reg.intercept_
from sklearn.metrics import r2_score
print ('Coefficient', reg.coef_)
print ('Intercept', reg.intercept_)
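As a quick check, the fitted line can be evaluated by hand from the coefficient and intercept. A minimal sketch, reusing reg and x_test from above:
m = reg.coef_[0]        # slope of the fitted line
c = reg.intercept_      # intercept of the fitted line
manual_pred = m * x_test[:, 0] + c
print(manual_pred)      # matches reg.predict(x_test)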
Accuracy of the model (R² score)
r2_score(y_test,y_pred)
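r2_score computes R² = 1 - SS_res / SS_tot. A minimal sketch that reproduces the value directly with numpy:
ss_res = np.sum((y_test - y_pred) ** 2)              # residual sum of squares
ss_tot = np.sum((y_test - np.mean(y_test)) ** 2)     # total sum of squares
print(1 - ss_res / ss_tot)                           # same value as r2_score(y_test, y_pred)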
Final Result in Data Frame
x_final = pd.DataFrame(x,columns= ['Experience'])
y_final = pd.DataFrame(y,columns= ['Salary'])
y_pred_final = pd.DataFrame(y_pred,columns= ['Salary Prediction'])
result = pd.concat([x_final,y_final,y_pred_final], axis =1)
print (result)
result.to_excel("C:\\Noble\\Training\\DS Temporary Files\\Simple
Regression.xlsx")
Create a Graph with predicted numbers
plt.scatter(x_train,y_train)
plt.plot(x_train, reg.predict(x_train), color='red')
Create the predicted graph on test data
plt.scatter(x_test,y_test)
plt.plot(x_train, reg.predict(x_train), color='red')
Prediction for new set of data
y_pred = reg.predict([[12], [9.6], [8.5], [2.5]])
print (y_pred)
Linear Regression Prediction with Data Frame
Import Libraries
import numpy as np
import pandas as pd
import os
import matplotlib.pyplot as plt
Change directory
os.chdir('C:\\Noble\\Training\\Top Mentor\\Training\\Data Set\\')
os.getcwd()
Load Data Set
df1= pd.read_csv('Salary_Data.csv')
print (df1)
Plot Graph
plt.plot(df1["YearsExperience"], df1["Salary"])
plt.show()
X and Y as Data Frame
x = df1.iloc[:,:-1]
print (x)
y = df1.iloc[:,1]
print (y)
Train Test Split
from sklearn.model_selection import train_test_split
x_train, x_test,y_train,y_test = train_test_split(x,y,test_size=0.2)
Linear Regression
from sklearn.linear_model import LinearRegression
reg = LinearRegression()
reg.fit(x_train, y_train)
Prediction
y_pred= reg.predict(x_test)
print (y_pred)
Coefficient and Intercept
print ('Coefficient', reg.coef_)
print ('Intercept', reg.intercept_)
Accuracy
from sklearn.metrics import r2_score
r2_score(y_test,y_pred)
Export data to Excel
y_pred_final = pd.DataFrame(reg.predict(x),columns= ['Salary Prediction'])
result = pd.concat([x,y,y_pred_final], axis =1)
print (result)
result.to_excel("C:\\Noble\\Training\\DS Temporary Files\\Simple
Regression.xlsx")
Multiple Linear Regression
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import os
from sklearn.metrics import r2_score
Load Data Set
os.chdir('C:\\Noble\\Training\\Top Mentor\\Training\\Data Set\\')
df1=pd.read_csv('50_Startups.csv')
df1
Split x and y
x = df1.iloc[:,:-1].values
print (x)
y = df1.iloc[:,4].values
print (y)
Label Encoding
from sklearn.preprocessing import LabelEncoder
Label = LabelEncoder()
x[:,3]= Label.fit_transform(x[:,3])
print (x)
One Hot Encoding
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
ct = ColumnTransformer(transformers=[('encoder', OneHotEncoder(), [3])], remainder='passthrough')
x = np.array(ct.fit_transform(x))
print (x)
Print X as Data Frame
print (pd.DataFrame(x))
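The transformed array places the encoded State dummies first and the passthrough columns after them, and the dummy codes 0/1/2 come from LabelEncoder's alphabetical ordering of the states. A minimal sketch that labels the columns, assuming a recent scikit-learn where ColumnTransformer has get_feature_names_out:
feature_names = ct.get_feature_names_out()     # generic x0-style names, because x is a plain numpy array
print(pd.DataFrame(x, columns=feature_names))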
Split the data: Train-Test Split
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)
Create the Model
from sklearn.linear_model import LinearRegression
reg= LinearRegression()
reg.fit(x_train,y_train)
Predictions
y_pred= reg.predict(x_test)
print (y_pred)
Print Result
result = pd.concat([pd.DataFrame(y_pred),pd.DataFrame(y_test)], axis =1)
print (result)
Print Y and Prediction in one data frame - Concat
y_pre= pd.DataFrame(y_pred, columns =['Prediction'])
y_te = pd.DataFrame(y_test,columns= ['Actual'])
x_te = pd.DataFrame(x_test, columns=['California', 'Florida', 'New York', 'R&D Spend', 'Administration', 'Marketing Spend'])
result = pd.concat([x_te,y_te,y_pre], axis =1)
print (result)
Accuracy
r2_score(y_test, y_pred)
Regression Coefficient
reg.coef_
Regression Intercept
reg.intercept_
Ordinary Least Squares (OLS) Method
x=x.astype('float64')
import statsmodels.api as sm
reg_ols = sm.OLS (endog = y, exog = x)
reg_ols = reg_ols.fit()
print (reg_ols.summary())
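Note that sm.OLS does not add an intercept by itself, so the summary above is for a model through the origin. A minimal sketch of a variant with an explicit constant, dropping one State dummy so the constant is not collinear with the dummies (the dummy variable trap):
x_const = sm.add_constant(x[:, 1:])               # drop the first dummy, prepend a column of ones
reg_ols_const = sm.OLS(endog=y, exog=x_const).fit()
print(reg_ols_const.summary())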
Tune the model by removing columns whose p-value is greater than 0.05
Print the Data Frame
pd.DataFrame(x)
Create the OLS model after removing the variable with the highest p-value: remove column 4 (Administration)
x_opt=x[:,[0,1,2,3,5]]
import statsmodels.api as sm
reg_ols = sm.OLS (endog = y, exog =x_opt)
reg_ols = reg_ols.fit()
print (reg_ols.summary())
Create the OLS model after again removing the variable with the highest p-value: remove the last column (Marketing Spend)
x_opt=x[:,[0,1,2,3]]
import statsmodels.api as sm
reg_ols = sm.OLS (endog = y, exog =x_opt)
reg_ols = reg_ols.fit()
print (reg_ols.summary())
With every variable whose p-value is greater than 0.05 removed, create the model again with the reduced set of columns
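The manual elimination above can also be automated. A minimal sketch of backward elimination at the 0.05 level, reusing x and y and repeatedly dropping the column with the highest p-value:
cols = list(range(x.shape[1]))                       # indices of the columns still in the model
while True:
    ols_model = sm.OLS(endog=y, exog=x[:, cols]).fit()
    worst = int(np.argmax(ols_model.pvalues))        # position of the highest p-value
    if ols_model.pvalues[worst] <= 0.05:
        break                                        # every remaining column is significant
    cols.pop(worst)                                  # drop the least significant column
print('Columns kept:', cols)
print(ols_model.summary())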
Train test Split
from sklearn.model_selection import train_test_split
xopt_train, xopt_test, y_train, y_test = train_test_split(x_opt, y, test_size=0.2, random_state=42)
Create Model
from sklearn.linear_model import LinearRegression
reg= LinearRegression()
reg.fit(xopt_train,y_train)
Prediction
yopt_pred= reg.predict(xopt_test)
print (yopt_pred)
Print Result
result = pd.concat([pd.DataFrame(yopt_pred),pd.DataFrame(y_test)], axis =1)
print (result)
Print Original Data Frame with Predicted Value
yopt_pre= pd.DataFrame(yopt_pred, columns =['Prediction'])
y_te = pd.DataFrame(y_test,columns= ['Actual'])
x_te = pd.DataFrame(x_test, columns=['California', 'Florida', 'New York', 'R&D Spend', 'Administration', 'Marketing Spend'])
result = pd.concat([x_te,y_te,yopt_pre], axis =1)
print (result)
Check Accuracy
r2_score(y_test, yopt_pred)
Prediction for All 50 records
yfull_pred= reg.predict(x_opt)
print (yfull_pred)
Accuracy
r2_score(y, yfull_pred)
Create the model with only the R&D Spend column
x_opt=x[:,3:4]
x_opt
Train Test Split
from sklearn.model_selection import train_test_split
xopt_train, xopt_test, y_train, y_test = train_test_split(x_opt, y, test_size=0.2, random_state=42)
Print Shape
print (xopt_train.shape)
Create Model with one column
from sklearn.linear_model import LinearRegression
freg= LinearRegression()
freg.fit(xopt_train,y_train)
Prediction and Check accuracy
yone_pred= freg.predict(x_opt)
r2_score(y, yone_pred)
Plot the result as a graph
import seaborn as sns
sns.regplot(x=yone_pred, y=y, scatter_kws={"color": "b"}, line_kws={"color": "r"}, ci=None)
Prediction for New Data Set
Load new Data Set
df_Predict=pd.read_csv('50_Startups_Predictions.csv')
df_Predict
Count Number of Records
df_Predict.count()
Create Array
x_Predict = df_Predict.values
print (x_Predict)
Label Encoding
Label_Predict = LabelEncoder()
x_Predict[:,3]= Label_Predict.fit_transform(x_Predict[:,3])
print (x_Predict)
One Hot Encoding
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
ct = ColumnTransformer(transformers=[('encoder', OneHotEncoder(), [3])], remainder='passthrough')
x_Predict = np.array(ct.fit_transform(x_Predict))
print (x_Predict)
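Refitting LabelEncoder and the ColumnTransformer on the prediction file only gives the same dummy-column layout if the new file happens to contain every state, because OneHotEncoder learns only the categories it sees. A safer variant fits the encoders on the training data and only calls transform on the new records; a minimal sketch reusing df1, Label and df_Predict from above (ct_train is a name introduced here for illustration):
x_train_raw = df1.iloc[:, :-1].values                   # original training features (State still as strings)
x_train_raw[:, 3] = Label.transform(x_train_raw[:, 3])  # same label mapping as the training section
ct_train = ColumnTransformer(transformers=[('encoder', OneHotEncoder(), [3])], remainder='passthrough')
ct_train.fit(x_train_raw)                               # fit on training data only
x_new = df_Predict.values
x_new[:, 3] = Label.transform(x_new[:, 3])              # reuse the training label mapping
x_new = np.array(ct_train.transform(x_new))             # transform only; no refit on new data
print(pd.DataFrame(x_new))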
Print X Values
print (pd.DataFrame(x_Predict))
Generate Predicted Values
xone_Predict= x_Predict[:,3:4]
yone_Predict= freg.predict(xone_Predict)
print (yone_Predict)
Display the result as a Data Frame with X
yone_Predict= pd.DataFrame(yone_Predict, columns =['Prediction'])
x_Predict = pd.DataFrame(x_Predict, columns=['California', 'Florida', 'New York', 'R&D Spend', 'Administration', 'Marketing Spend'])
result = pd.concat([x_Predict,yone_Predict], axis =1)
print (result)
Display the result with Actual Input Data Set
result = pd.concat([df_Predict, yone_Predict], axis=1)   # yone_Predict is already a DataFrame from the previous step
print (result)