Code:
#Importing the required libraries
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
#load the data into a dataframe
df = pd.read_csv('uber.csv')
#check the first 5 rows
df.head()
#print Dataset
print("Original Dataset")
print(df)
#Data Preprocessing
#drop the unnecessary columns
#df = df.drop(columns=['Unnamed: 0', 'key', 'pickup_datetime', 'pickup_longitude', 'pickup_latitude', 'dropoff_longitude', 'dropoff_latitude'])
df = df.drop(['Unnamed: 0', 'key'], axis=1) #To drop the unnamed index and key columns as they aren't required
print("Dataset after dropping the unnecessary columns")
print(df)
print(df.dtypes) #To get the type of each column
print(df.shape) #To get the total (Rows,Columns)
print(df.describe()) #To get statistics of each column
# Filling Missing Values
df.isnull().sum()
df['dropoff_latitude'] = df['dropoff_latitude'].fillna(df['dropoff_latitude'].mean()) #Fill missing latitudes with the mean
df['dropoff_longitude'] = df['dropoff_longitude'].fillna(df['dropoff_longitude'].median()) #Fill missing longitudes with the median
df.isnull().sum()
print(df.dtypes)
#Column pickup_datetime is in wrong format (Object). Convert it to DateTime Format
df.pickup_datetime = pd.to_datetime(df.pickup_datetime, errors='coerce')
print(df.dtypes)
df = df.assign(hour = df.pickup_datetime.dt.hour,
               day = df.pickup_datetime.dt.day,
               month = df.pickup_datetime.dt.month,
               year = df.pickup_datetime.dt.year,
               dayofweek = df.pickup_datetime.dt.dayofweek)
print(df)
# drop the column 'pickup_datetime' using drop()
# 'axis = 1' drops the specified column
df = df.drop('pickup_datetime',axis=1)
print(df)
#Checking for outliers and treating them
#df.plot(kind="box", subplots=True, layout=(7,2), figsize=(15,20)) #Boxplots to check the outliers
#Using the interquartile range (IQR) to cap the outlier values at the whiskers
def remove_outlier(df1, col):
    Q1 = df1[col].quantile(0.25)
    Q3 = df1[col].quantile(0.75)
    IQR = Q3 - Q1
    lower_whisker = Q1 - 1.5*IQR
    upper_whisker = Q3 + 1.5*IQR
    df1[col] = np.clip(df1[col], lower_whisker, upper_whisker) #Cap values outside the whiskers
    return df1

def treat_outliers_all(df1, col_list):
    for c in col_list:
        df1 = remove_outlier(df1, c)
    return df1

df = treat_outliers_all(df, df.columns) #Treat every column
print("Outliers")
print(df)
#pip install haversine
import haversine as hs #Use the haversine formula for the distance between two points on a sphere; Euclidean distance can't be used as it assumes a flat surface
travel_dist = []
for pos in range(len(df['pickup_longitude'])):
    long1, lati1, long2, lati2 = (df['pickup_longitude'][pos], df['pickup_latitude'][pos],
                                  df['dropoff_longitude'][pos], df['dropoff_latitude'][pos])
    loc1 = (lati1, long1)
    loc2 = (lati2, long2)
    c = hs.haversine(loc1, loc2) #Great-circle distance in kilometres
    travel_dist.append(c)
#print(travel_dist)
df['dist_travel_km'] = travel_dist
print("Distance Calculated")
print(df)
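#Optional: the per-row loop above is correct but slow on large frames. A
#vectorized NumPy sketch of the same haversine formula (assuming the mean
#Earth radius of 6371 km, the unit the haversine package also defaults to):
def haversine_np(lon1, lat1, lon2, lat2):
    lon1, lat1, lon2, lat2 = map(np.radians, [lon1, lat1, lon2, lat2])
    a = np.sin((lat2 - lat1) / 2)**2 + np.cos(lat1) * np.cos(lat2) * np.sin((lon2 - lon1) / 2)**2
    return 2 * 6371 * np.arcsin(np.sqrt(a))
#df['dist_travel_km'] = haversine_np(df['pickup_longitude'], df['pickup_latitude'],
#                                    df['dropoff_longitude'], df['dropoff_latitude'])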
#Compute the correlation matrix
corr = df.corr()
print("Correlation")
print(corr)
#Dividing the dataset into feature and target values
x = df[['pickup_longitude','pickup_latitude','dropoff_longitude','dropoff_latitude','passenger_count',
        'hour','day','month','year','dayofweek','dist_travel_km']]
y = df['fare_amount']
#Dividing the dataset into training and testing dataset
X_train,X_test,y_train,y_test = train_test_split(x,y,test_size = 0.33)
#Linear Regression
regression = LinearRegression()
regression.fit(X_train,y_train)
#To find the linear intercept
print("Intercept:", regression.intercept_)
#To find the linear coefficients
print("Coefficients:", regression.coef_)
#To predict the target values
prediction = regression.predict(X_test)
print("Prediction")
print(prediction)
print("Y Test ")
print(y_test)
#Metrics evaluation using R2, Mean Squared Error, and Root Mean Squared Error
from sklearn.metrics import r2_score
print("R2 Score")
print(r2_score(y_test,prediction))
print("MSE")
from sklearn.metrics import mean_squared_error
MSE = mean_squared_error(y_test,prediction)
print(MSE)
print("RMSE")
RMSE = np.sqrt(MSE)
print(RMSE)
#Random Forest Regression
from sklearn.ensemble import RandomForestRegressor
#Here n_estimators is the number of trees in the forest; their predictions are averaged
rf = RandomForestRegressor(n_estimators=100)
rf.fit(X_train,y_train)
y_pred = rf.predict(X_test)
print(y_pred)
#Metrics evaluation for Random Forest
R2_Random = r2_score(y_test,y_pred)
print("R2 Random")
print(R2_Random)
print("MSE_Random")
MSE_Random = mean_squared_error(y_test,y_pred)
print(MSE_Random)
print("RMSE")
RMSE_Random = np.sqrt(MSE_Random)
print(RMSE_Random)
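#A side-by-side summary of the two models (a sketch reusing the metrics
#computed above; nothing new is fitted):
summary = pd.DataFrame({'Model': ['Linear Regression', 'Random Forest'],
                        'R2': [r2_score(y_test, prediction), R2_Random],
                        'RMSE': [RMSE, RMSE_Random]})
print(summary)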
Output:
Code:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings('ignore')
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn import metrics
df=pd.read_csv('emails.csv')
print(df)
df.columns
df.isnull().sum()
df.dropna(inplace = True)
df.drop(['Email No.'],axis=1,inplace=True)
X = df.drop(['Prediction'],axis = 1)
y = df['Prediction']
from sklearn.preprocessing import scale
X = scale(X)
# split into train and test
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state = 42)
#KNN classifier
from sklearn.neighbors import KNeighborsClassifier
knn = KNeighborsClassifier(n_neighbors=7)
knn.fit(X_train, y_train)
y_pred = knn.predict(X_test)
print("Prediction",y_pred)
print("KNN accuracy = ",metrics.accuracy_score(y_test,y_pred))
print("Confusion matrix",metrics.confusion_matrix(y_test,y_pred))
#SVM classifier
model = SVC(C = 1)
# fit
model.fit(X_train, y_train)
# predict
y_pred = model.predict(X_test)
metrics.confusion_matrix(y_true=y_test, y_pred=y_pred)
print("SVM accuracy = ",metrics.accuracy_score(y_test,y_pred))
Output:
Code:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt #Importing the libraries
df = pd.read_csv("Churn_Modelling.csv")
#Preprocessing
df.head()
df.shape
df.describe()
df.isnull()
df.isnull().sum()
df.info()
df.dtypes
df.columns
df = df.drop(['RowNumber', 'Surname', 'CustomerId'], axis= 1)
df.head()
def visualization(x, y, xlabel):
    plt.figure(figsize=(10,5))
    plt.hist([x, y], color=['red', 'green'], label=['exit', 'not_exit'])
    plt.xlabel(xlabel, fontsize=20)
    plt.ylabel("No. of customers", fontsize=20)
    plt.legend()
df_churn_exited = df[df['Exited']==1]['Tenure']
df_churn_not_exited = df[df['Exited']==0]['Tenure']
visualization(df_churn_exited, df_churn_not_exited, "Tenure")
df_churn_exited2 = df[df['Exited']==1]['Age']
df_churn_not_exited2 = df[df['Exited']==0]['Age']
visualization(df_churn_exited2, df_churn_not_exited2, "Age")
X = df[['CreditScore','Gender','Age','Tenure','Balance','NumOfProducts','HasCrCard',
        'IsActiveMember','EstimatedSalary']]
states = pd.get_dummies(df['Geography'],drop_first = True)
gender = pd.get_dummies(df['Gender'],drop_first = True)
df = pd.concat([df,gender,states], axis = 1)
df.head()
X = df[['CreditScore','Age','Tenure','Balance','NumOfProducts','HasCrCard','IsActiveMember',
        'EstimatedSalary','Male','Germany','Spain']]
y = df['Exited']
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size = 0.30)
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)
X_train
import keras
from keras.models import Sequential
from keras.layers import Dense
classifier = Sequential()
classifier.add(Dense(activation = "relu", input_dim = 11, units = 6, kernel_initializer = "uniform"))
classifier.add(Dense(activation = "relu",units = 6,kernel_initializer = "uniform"))
classifier.add(Dense(activation = "sigmoid",units = 1,kernel_initializer = "uniform"))
classifier.compile(optimizer="adam",loss = 'binary_crossentropy',metrics = ['accuracy'])
classifier.summary()
classifier.fit(X_train,y_train,batch_size=10,epochs=50) #Fitting the ANN to training dataset
y_pred =classifier.predict(X_test)
y_pred = (y_pred > 0.5) #Convert the predicted probabilities to class labels using a 0.5 threshold
from sklearn.metrics import confusion_matrix,accuracy_score,classification_report
cm = confusion_matrix(y_test,y_pred)
cm
accuracy = accuracy_score(y_test,y_pred)
accuracy
plt.figure(figsize = (10,7))
sns.heatmap(cm,annot = True)
plt.xlabel('Predicted')
plt.ylabel('Truth')
print(classification_report(y_test,y_pred))
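#An extra, threshold-independent check (a sketch, not part of the original
#lab): ROC-AUC computed on the raw sigmoid outputs before thresholding.
from sklearn.metrics import roc_auc_score
y_prob = classifier.predict(X_test).ravel() #Probabilities in [0, 1]
print("ROC-AUC:", roc_auc_score(y_test, y_prob))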
Output:
Code:
cur_x = 2 # The algorithm starts at x=2
rate = 0.01 # Learning rate
precision = 0.000001 #This tells us when to stop the algorithm
previous_step_size = 1 # Initial step size; must exceed precision to enter the loop
max_iters = 10000 # maximum number of iterations
iters = 0 #iteration counter
df = lambda x: 2*(x+3) #Gradient of our function f(x) = (x+3)^2
while previous_step_size > precision and iters < max_iters:
    prev_x = cur_x #Store current x value in prev_x
    cur_x = cur_x - rate * df(prev_x) #Gradient descent step
    previous_step_size = abs(cur_x - prev_x) #Change in x
    iters = iters+1 #Iteration count
    print("Iteration",iters,"\nX value is",cur_x) #Print the progress each iteration
print("The local minimum occurs at", cur_x)
Output:
Code:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings('ignore')
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn import metrics
df=pd.read_csv('diabetes.csv')
df.columns
df.isnull().sum()
X = df.drop('Outcome',axis = 1)
y = df['Outcome']
from sklearn.preprocessing import scale
X = scale(X)
# split into train and test
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state = 42)
from sklearn.neighbors import KNeighborsClassifier
knn = KNeighborsClassifier(n_neighbors=7)
knn.fit(X_train, y_train)
y_pred = knn.predict(X_test)
print("Confusion matrix: ")
cs = metrics.confusion_matrix(y_test,y_pred)
print(cs)
print("Acccuracy ",metrics.accuracy_score(y_test,y_pred))
total_misclassified = cs[0,1] + cs[1,0]
print(total_misclassified)
total_examples = cs[0,0]+cs[0,1]+cs[1,0]+cs[1,1]
print(total_examples)
print("Error rate",total_misclassified/total_examples)
print("Error rate ",1-metrics.accuracy_score(y_test,y_pred))
print("Precision score",metrics.precision_score(y_test,y_pred))
print("Recall score ",metrics.recall_score(y_test,y_pred))
print("Classification report ",metrics.classification_report(y_test,y_pred))
Output:
Code:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
#Importing the required libraries.
from sklearn.cluster import KMeans #For clustering
from sklearn.decomposition import PCA #Linear Dimensionality reduction.
df = pd.read_csv("sales_data_sample.csv",encoding= 'unicode_escape') #Loading the dataset.
print(df)
df_drop = ['ADDRESSLINE1', 'ADDRESSLINE2', 'STATUS', 'POSTALCODE', 'CITY',
           'TERRITORY', 'PHONE', 'STATE', 'CONTACTFIRSTNAME', 'CONTACTLASTNAME',
           'CUSTOMERNAME', 'ORDERNUMBER']
df = df.drop(df_drop, axis=1) #Dropping the unnecessary categorical columns along with columns having null values; they can't be filled as there are a lot of nulls
print(df)
# Checking the categorical columns.
df['COUNTRY'].unique()
df['PRODUCTLINE'].unique()
df['DEALSIZE'].unique()
productline = pd.get_dummies(df['PRODUCTLINE']) #Converting the categorical columns
Dealsize = pd.get_dummies(df['DEALSIZE'])
df = pd.concat([df,productline,Dealsize], axis = 1)
df_drop = ['COUNTRY','PRODUCTLINE','DEALSIZE'] #Dropping COUNTRY too, as there are a lot of countries
df = df.drop(df_drop, axis=1)
df['PRODUCTCODE'] = pd.Categorical(df['PRODUCTCODE']).codes #Converting the datatype to numeric codes
df.drop('ORDERDATE', axis=1, inplace=True) #Dropping ORDERDATE as the month column is already included
df.dtypes #All the datatypes are converted into numeric
distortions = [] # Within Cluster Sum of Squares from the centroid
#Plotting the Elbow Plot to determine the number of clusters
K = range(1,10)
for k in K:
    kmeanModel = KMeans(n_clusters=k)
    kmeanModel.fit(df)
    distortions.append(kmeanModel.inertia_) #Appending the inertia to the distortions list
plt.figure(figsize=(16,8))
plt.plot(K, distortions, 'bx-')
plt.xlabel('k')
plt.ylabel('Distortion')
plt.title('The Elbow Method showing the optimal k')
plt.show()
#As the number of clusters k increases, inertia decreases.
#Observation: an elbow can be observed at k=3, after which the curve decreases gradually.
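#The elbow read-off is subjective; the silhouette score is a complementary
#check (a sketch; the range of k and the random_state are illustrative):
from sklearn.metrics import silhouette_score
for k in range(2, 6):
    labels = KMeans(n_clusters=k, random_state=2).fit_predict(df)
    print("k =", k, "silhouette =", round(silhouette_score(df, labels), 4))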
X_train = df.values #Returns a numpy array.
X_train.shape
model = KMeans(n_clusters=3,random_state=2) #Number of cluster = 3
model = model.fit(X_train) #Fitting the values to create a model.
predictions = model.predict(X_train) #Predicting the cluster values (0,1,or 2)
unique,counts = np.unique(predictions,return_counts=True)
counts = counts.reshape(1,3)
counts_df = pd.DataFrame(counts,columns=['Cluster1','Cluster2','Cluster3'])
print(counts_df)
pca = PCA(n_components=2) #Reducing all the features to 2 components with Principal Component Analysis to make visualization easy
reduced_X = pd.DataFrame(pca.fit_transform(X_train),columns=['PCA1','PCA2']) #Creating a DataFrame of the reduced data
print(reduced_X)
#Plotting the normal Scatter Plot
plt.figure(figsize=(14,10))
plt.scatter(reduced_X['PCA1'],reduced_X['PCA2'])
plt.show()
model.cluster_centers_ #Finding the centroids (3 in total; each array holds one centroid's coordinates across all features)
reduced_centers = pca.transform(model.cluster_centers_) #Projecting the 3 centroids into the 2-D PCA space
reduced_centers
plt.figure(figsize=(14,10))
plt.scatter(reduced_X['PCA1'],reduced_X['PCA2'])
plt.scatter(reduced_centers[:,0],reduced_centers[:,1],color='black',marker='x',s=300) #Plotting the centroids
plt.show()
reduced_X['Clusters'] = predictions #Adding the Clusters to the reduced dataframe.
reduced_X.head()
#Plotting the clusters
plt.figure(figsize=(14,10))
#For each cluster number, take that cluster's PCA1 and PCA2 columns and assign it a colour
plt.scatter(reduced_X[reduced_X['Clusters'] == 0].loc[:,'PCA1'],reduced_X[reduced_X['Clusters'] == 0].loc[:,'PCA2'],color='slateblue')
plt.scatter(reduced_X[reduced_X['Clusters'] == 1].loc[:,'PCA1'],reduced_X[reduced_X['Clusters'] == 1].loc[:,'PCA2'],color='springgreen')
plt.scatter(reduced_X[reduced_X['Clusters'] == 2].loc[:,'PCA1'],reduced_X[reduced_X['Clusters'] == 2].loc[:,'PCA2'],color='indigo')
plt.scatter(reduced_centers[:,0],reduced_centers[:,1],color='black',marker='x',s=300)
plt.show()
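#To interpret the clusters (a sketch; 'SALES' is assumed to be one of the
#numeric columns retained from sales_data_sample.csv):
df['Cluster'] = predictions
print(df.groupby('Cluster')['SALES'].mean()) #Average order value per cluster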
Output: