1 Importing Data and Data Visualization Task

[18]: import random


import pandas as pd
import matplotlib.pyplot as plt

[19]: data = {
    'age': [random.randint(20, 60) for _ in range(100)],
    'gender': [random.choice(['Male', 'Female']) for _ in range(100)],
    'income': [random.randint(20000, 100000) for _ in range(100)],
}
df = pd.DataFrame(data)
df.to_csv('data.csv', index=False)
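
Because the sample is drawn with Python's random module, every run writes a different data.csv; seeding the generator first makes the file reproducible (a minimal sketch, with an arbitrary seed value):

[ ]: # Seed Python's RNG before sampling so repeated runs regenerate the same data
random.seed(42)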

[20]: data = pd.read_csv('data.csv')


data.head()

[20]: age gender income


0 21 Female 42551
1 46 Male 73093
2 51 Female 98193
3 27 Female 29804
4 37 Male 52151

[10]: data.tail()

[10]: age gender income


95 46 Female 66391
96 39 Female 28883
97 54 Female 23822
98 31 Female 96601
99 48 Female 56378

[13]: plt.hist(data['age'], color = "red", bins = 20)


plt.xlabel('Age')
plt.ylabel('Frequency')
plt.title('Age Distribution')
plt.show()

[14]: counts = data['gender'].value_counts()
plt.bar(counts.index, counts, color='pink')  # index-aligned labels avoid the unique()/value_counts() ordering mismatch

plt.xlabel('Gender')
plt.ylabel('Count')
plt.title('Gender Comparison')
plt.show()

[15]: counts = data['gender'].value_counts()
plt.figure(figsize=(1,2))
plt.bar(counts.index, counts, width=0.4)

plt.xlabel('Gender')
plt.ylabel('Count')
plt.title('Gender Comparison')
plt.show()

[16]: counts = data['gender'].value_counts()
plt.pie(counts, labels=counts.index)

plt.title('Gender Proportion')
plt.show()

[17]: plt.scatter(data['age'], data['income'])


plt.xlabel('Age')
plt.ylabel('Income')
plt.title('Age vs Income')
plt.show()

2 Data Cleaning and Pre-processing Task

[1]: import pandas as pd


import numpy as np
from sklearn.preprocessing import MinMaxScaler,StandardScaler,RobustScaler
from sklearn.preprocessing import LabelEncoder

[5]: data1 = {
    'Age': [35, 41, 23, 32, 28, 36, 45, 39, 44, 29],
    'Income': [70000, 90000, 50000, 60000, None, 75000, 100000, 80000, 95000, 55000],
    'Date': ['2020-01-01', '2020-01-02', '2020-01-03', '2020-01-04', '2020-01-05', '2020-01-06', '2020-01-07', None, '2020-01-09', '2020-01-10'],
    'Marital Status': ['Married', 'Single', 'Married', 'Single', 'Married', 'Single', 'Married', 'Single', 'Married', 'Single'],
    'Gender': ['Female', 'Male', 'Male', 'Female', 'Female', 'Male', 'Female', 'Male', 'Male', 'Female'],
}
df = pd.DataFrame(data1)
df.to_csv('data1.csv', index=False)

[7]: df=pd.read_csv('data1.csv')
print(df)

Age Income Date Marital Status Gender


0 35 70000.0 2020-01-01 Married Female
1 41 90000.0 2020-01-02 Single Male
2 23 50000.0 2020-01-03 Married Male
3 32 60000.0 2020-01-04 Single Female
4 28 NaN 2020-01-05 Married Female
5 36 75000.0 2020-01-06 Single Male
6 45 100000.0 2020-01-07 Married Female
7 39 80000.0 NaN Single Male
8 44 95000.0 2020-01-09 Married Male
9 29 55000.0 2020-01-10 Single Female
[8]: print(df.columns)
print(df.isnull().sum())
print(df)

Index(['Age', 'Income', 'Date', 'Marital Status', 'Gender'], dtype='object')


Age 0
Income 1
Date 1
Marital Status 0
Gender 0
dtype: int64
Age Income Date Marital Status Gender
0 35 70000.0 2020-01-01 Married Female
1 41 90000.0 2020-01-02 Single Male
2 23 50000.0 2020-01-03 Married Male
3 32 60000.0 2020-01-04 Single Female
4 28 NaN 2020-01-05 Married Female
5 36 75000.0 2020-01-06 Single Male
6 45 100000.0 2020-01-07 Married Female
7 39 80000.0 NaN Single Male
8 44 95000.0 2020-01-09 Married Male
9 29 55000.0 2020-01-10 Single Female
[9]: df['Income'] = df['Income'].fillna(df['Income'].mean())  # assignment avoids the deprecated chained inplace fill
print(df)

Age Income Date Marital Status Gender


0 35 70000.0 2020-01-01 Married Female
1 41 90000.0 2020-01-02 Single Male
2 23 50000.0 2020-01-03 Married Male
3 32 60000.0 2020-01-04 Single Female
4 28 75000.0 2020-01-05 Married Female
5 36 75000.0 2020-01-06 Single Male
6 45 100000.0 2020-01-07 Married Female
7 39 80000.0 NaN Single Male
8 44 95000.0 2020-01-09 Married Male
9 29 55000.0 2020-01-10 Single Female
[10]: df['Date'] = df['Date'].ffill()  # forward-fill; fillna(method='ffill') is deprecated
print(df)

Age Income Date Marital Status Gender


0 35 70000.0 2020-01-01 Married Female
1 41 90000.0 2020-01-02 Single Male
2 23 50000.0 2020-01-03 Married Male
3 32 60000.0 2020-01-04 Single Female
4 28 75000.0 2020-01-05 Married Female
5 36 75000.0 2020-01-06 Single Male
6 45 100000.0 2020-01-07 Married Female
7 39 80000.0 2020-01-07 Single Male
8 44 95000.0 2020-01-09 Married Male
9 29 55000.0 2020-01-10 Single Female
[11]: df.dropna(inplace=True)
print(df)

Age Income Date Marital Status Gender


0 35 70000.0 2020-01-01 Married Female
1 41 90000.0 2020-01-02 Single Male
2 23 50000.0 2020-01-03 Married Male
3 32 60000.0 2020-01-04 Single Female
4 28 75000.0 2020-01-05 Married Female
5 36 75000.0 2020-01-06 Single Male
6 45 100000.0 2020-01-07 Married Female
7 39 80000.0 2020-01-07 Single Male
8 44 95000.0 2020-01-09 Married Male
9 29 55000.0 2020-01-10 Single Female

[12]: df1 = df.drop('Gender',axis=1)


print(df1)

Age Income Date Marital Status


0 35 70000.0 2020-01-01 Married
1 41 90000.0 2020-01-02 Single
2 23 50000.0 2020-01-03 Married
3 32 60000.0 2020-01-04 Single
4 28 75000.0 2020-01-05 Married
5 36 75000.0 2020-01-06 Single
6 45 100000.0 2020-01-07 Married
7 39 80000.0 2020-01-07 Single
8 44 95000.0 2020-01-09 Married
9 29 55000.0 2020-01-10 Single
[13]: df1 = df1.assign(age_squared=lambda x: x['Age']**2)
print(df1)

Age Income Date Marital Status age_squared


0 35 70000.0 2020-01-01 Married 1225
1 41 90000.0 2020-01-02 Single 1681
2 23 50000.0 2020-01-03 Married 529
3 32 60000.0 2020-01-04 Single 1024
4 28 75000.0 2020-01-05 Married 784
5 36 75000.0 2020-01-06 Single 1296
6 45 100000.0 2020-01-07 Married 2025
7 39 80000.0 2020-01-07 Single 1521
8 44 95000.0 2020-01-09 Married 1936
9 29 55000.0 2020-01-10 Single 841
[14]: le = LabelEncoder()
df[['Marital Status','Gender']] = df[['Marital Status','Gender']].apply(le.fit_transform)

df

[14]: Age Income Date Marital Status Gender


0 35 70000.0 2020-01-01 0 0
1 41 90000.0 2020-01-02 1 1
2 23 50000.0 2020-01-03 0 1
3 32 60000.0 2020-01-04 1 0
4 28 75000.0 2020-01-05 0 0
5 36 75000.0 2020-01-06 1 1
6 45 100000.0 2020-01-07 0 0
7 39 80000.0 2020-01-07 1 1
8 44 95000.0 2020-01-09 0 1
9 29 55000.0 2020-01-10 1 0

[15]: scaler = MinMaxScaler()


df1['Age'] = scaler.fit_transform(df1[['Age']])
df1

[15]: Age Income Date Marital Status age_squared


0 0.545455 70000.0 2020-01-01 Married 1225
1 0.818182 90000.0 2020-01-02 Single 1681
2 0.000000 50000.0 2020-01-03 Married 529
3 0.409091 60000.0 2020-01-04 Single 1024
4 0.227273 75000.0 2020-01-05 Married 784
5 0.590909 75000.0 2020-01-06 Single 1296
6 1.000000 100000.0 2020-01-07 Married 2025
7 0.727273 80000.0 2020-01-07 Single 1521
8 0.954545 95000.0 2020-01-09 Married 1936
9 0.272727 55000.0 2020-01-10 Single 841

[16]: scaler = StandardScaler()


df1['Age'] = scaler.fit_transform(df1[['Age']])
df1

[16]: Age Income Date Marital Status age_squared


0 -0.029123 70000.0 2020-01-01 Married 1225
1 0.844581 90000.0 2020-01-02 Single 1681
2 -1.776532 50000.0 2020-01-03 Married 529
3 -0.465975 60000.0 2020-01-04 Single 1024
4 -1.048445 75000.0 2020-01-05 Married 784
5 0.116494 75000.0 2020-01-06 Single 1296
6 1.427050 100000.0 2020-01-07 Married 2025
7 0.553346 80000.0 2020-01-07 Single 1521
8 1.281433 95000.0 2020-01-09 Married 1936
9 -0.902828 55000.0 2020-01-10 Single 841
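
RobustScaler is imported above but never exercised; for completeness, a minimal sketch applying it to Income. It centres on the median and scales by the interquartile range, so extreme values distort the result less than with min-max or z-score scaling:

[ ]: scaler = RobustScaler()
# Median/IQR scaling of the Income column
df1['Income'] = scaler.fit_transform(df1[['Income']])
df1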

3 Linear Regression

[1]: import numpy as np


import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import PolynomialFeatures
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LinearRegression

[10]: df_sal = pd.read_csv("Position_Salaries.csv")


df_sal.head()

[10]: Position Level Salary


0 Business Analyst 1 45000
1 Junior Consultant 2 50000
2 Senior Consultant 3 60000
3 Manager 4 80000
4 Country Manager 5 110000

[11]: df_sal.describe()

[11]: Level Salary


count 10.00000 10.000000
mean 5.50000 249500.000000
std 3.02765 299373.883668
min 1.00000 45000.000000
25% 3.25000 65000.000000
50% 5.50000 130000.000000
75% 7.75000 275000.000000
max 10.00000 1000000.000000

[13]: plt.title('Salary Distribution Plot')
sns.histplot(df_sal['Salary'], kde=True)  # sns.distplot is deprecated; histplot(..., kde=True) is the current equivalent
plt.show()

[14]: plt.scatter(df_sal['Level'], df_sal['Salary'], color='lightcoral')


plt.title('Salary vs level')
plt.xlabel('Level')
plt.ylabel('Salary')
plt.box(False)
plt.show()

[15]: X=df_sal.iloc[:,1:-1].values
y=df_sal.iloc[:,-1].values

[16]: pr= PolynomialFeatures(degree = 4)


X_poly = pr.fit_transform(X)
lr_2 = LinearRegression()
lr_2.fit(X_poly, y)
lr=LinearRegression()
lr.fit(X,y)

[16]: LinearRegression()

[17]: y_pred_lr = lr.predict(X)


y_pred_poly = lr_2.predict(X_poly)

[18]: plt.scatter(X, y, color='lightcoral')
plt.plot(X, lr.predict(X), color='firebrick')
plt.title('Real data (Linear Regression)')
plt.xlabel('Positional Level')
plt.ylabel('Salary')
plt.legend(['X/y', 'X/y_pred_lr'], title='Salary/Level', loc='best', facecolor='white')  # scatter (real data) is drawn first, so its label comes first
plt.box(False)
plt.show()

[20]: plt.scatter(X, y, color='lightcoral')
plt.plot(X, lr_2.predict(X_poly), color='firebrick')
plt.title('Real data (Polynomial Regression)')
plt.xlabel('Positional Level')
plt.ylabel('Salary')
plt.legend(['X/y', 'X/y_pred_poly'], title='Salary/Level', loc='best', facecolor='white')
plt.box(False)
plt.show()
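
A quick way to compare the two fits numerically is the coefficient of determination; a minimal sketch using sklearn.metrics.r2_score (not called in the cells above):

[ ]: from sklearn.metrics import r2_score
# R^2 on the training data for the straight-line and degree-4 fits
print('Linear R^2:    ', r2_score(y, y_pred_lr))
print('Polynomial R^2:', r2_score(y, y_pred_poly))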

4 Multiple Linear Regression

[1]: from sklearn.linear_model import LinearRegression


import numpy as np
import pandas as pd
import seaborn as sns
from sklearn import preprocessing
from matplotlib import pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score

[3]: c1 = "advertising.csv"
df = pd.read_csv(c1)  # read_csv already returns a DataFrame, so no further conversion is needed

[5]: df.shape

[5]: (200, 4)

[6]: df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 200 entries, 0 to 199
Data columns (total 4 columns):
# Column Non-Null Count Dtype
0 TV 200 non-null float64
1 Radio 200 non-null float64
2 Newspaper 200 non-null float64
3 Sales 200 non-null float64
dtypes: float64(4)
memory usage: 6.4 KB
[7]: df.isnull().sum()

[7]: TV 0
Radio 0
Newspaper 0
Sales 0
dtype: int64

[8]: X=df.drop('Sales',axis=1)
y=df['Sales']

[9]: x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=41)

print(x_train.shape)
print(y_train.shape)
print(x_test.shape)
print(y_test.shape)

(140, 3)
(140,)
(60, 3)
(60,)
[10]: model = LinearRegression()
model.fit(x_train , y_train)

[10]: LinearRegression()

[11]: y_pred = model.predict(x_test)

[12]: plt.scatter(y_test, y_pred)
plt.xlabel('Actual Sales')
plt.ylabel('Predicted Sales')
plt.title('Multiple Linear Regression')
lims = [min(min(y_test), min(y_pred)), max(max(y_test), max(y_pred))]
plt.plot(lims, lims, 'k--')  # 45-degree reference line: perfect predictions fall on it
plt.show()

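
mean_squared_error and r2_score are imported at the top of this section but never called; a minimal sketch evaluating the model on the held-out split:

[ ]: # Test-set error and goodness of fit
print('MSE:', mean_squared_error(y_test, y_pred))
print('R^2:', r2_score(y_test, y_pred))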

5 Logistic Regression

[37]: import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import seaborn as sns
import matplotlib.pyplot as plt

[23]: np.random.seed(123)
n = 1000
age = np.random.randint(20, 40, n)
gender = np.random.choice(['male', 'female'], n)
education = np.random.choice(['high school', 'college', 'graduate'], n)
job_level = np.random.choice(['junior', 'senior'], n)
last_evaluation = np.random.uniform(0.4, 1, n)
average_monthly_hours = np.random.randint(100, 300, n)
time_spend_company = np.random.randint(1, 10, n)
number_of_projects = np.random.randint(1, 7, n)
work_accident = np.random.randint(0, 1, n)  # upper bound is exclusive, so this column is always 0
promotion = np.random.choice([0, 1], n)
salary = np.random.choice(['low', 'medium', 'high'], n)

[24]: work_accident.shape

[24]: (1000,)

[25]: df = pd.DataFrame({
    'age': age,
    'gender': gender,
    'education': education,
    'job_level': job_level,
    'last_evaluation': last_evaluation,
    'average_monthly_hours': average_monthly_hours,
    'time_spend_company': time_spend_company,
    'number_of_projects': number_of_projects,
    'work_accident': work_accident,
    'promotion': promotion,
    'salary': salary,
})

# save DataFrame as csv file
df.to_csv('Employee.csv', index=False)

[26]: print(df.isnull().sum())

age 0
gender 0
education 0
job_level 0
last_evaluation 0
average_monthly_hours 0
time_spend_company 0
number_of_projects 0
work_accident 0
promotion 0
salary 0
dtype: int64
[27]: df = df.drop(['number_of_projects','gender', 'last_evaluation'],axis=1)
df

[27]: age education job_level average_monthly_hours time_spend_company \


0 33 college junior 174 3
1 22 graduate senior 198 6
2 22 high school senior 185 4
3 26 high school junior 245 9
4 37 college junior 272 8
.. ... ... ... ... ...
995 39 high school senior 224 8
996 32 high school senior 150 5
997 30 college senior 222 2
998 23 college senior 251 7
999 21 college junior 250 2

work_accident promotion salary


0 0 1 high
1 0 0 high
2 0 0 low
3 0 1 medium
4 0 0 medium
.. ... ... ...
995 0 0 low
996 0 1 low
997 0 1 low
998 0 1 high
999 0 1 medium

[1000 rows x 8 columns]

[28]: le = LabelEncoder()
df[['age','education','job_level','salary']] = df[['age','education','job_level','salary']].apply(le.fit_transform)

df

[28]: age education job_level average_monthly_hours time_spend_company \


0 13 0 0 174 3
1 2 1 1 198 6
2 2 2 1 185 4
3 6 2 0 245 9
4 17 0 0 272 8
.. ... ... ... ... ...
995 19 2 1 224 8
996 12 2 1 150 5
997 10 0 1 222 2
998 3 0 1 251 7
999 1 0 0 250 2

work_accident promotion salary


0 0 1 0
1 0 0 0
2 0 0 1
3 0 1 2
4 0 0 2
.. ... ... ...
995 0 0 1
996 0 1 1
997 0 1 1
998 0 1 0
999 0 1 2

[1000 rows x 8 columns]

[29]: X=df.drop('promotion',axis=1)
y=df['promotion']

[30]: x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=41)

print(x_train.shape)
print(y_train.shape)
print(x_test.shape)
print(y_test.shape)

(700, 7)
(700,)
(300, 7)
(300,)
[31]: from sklearn.linear_model import LogisticRegression

[32]: model = LogisticRegression()


model.fit(x_train , y_train)

C:\ProgramData\anaconda3\Lib\site-
packages\sklearn\linear_model\_logistic.py:460: ConvergenceWarning: lbfgs failed
to converge (status=1):
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
https://scikit-learn.org/stable/modules/linear_model.html#logistic-
regression
n_iter_i = _check_optimize_result(
[32]: LogisticRegression()

[33]: y_pred = model.predict(x_test)

[34]: from sklearn.metrics import confusion_matrix, classification_report

[35]: print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.48      0.81      0.60       145
           1       0.49      0.17      0.26       155

    accuracy                           0.48       300
   macro avg       0.48      0.49      0.43       300
weighted avg       0.48      0.48      0.42       300

[38]: cm = confusion_matrix(y_test, y_pred)

ax = plt.subplot()
sns.heatmap(cm, annot=True, ax=ax, cmap='Blues', fmt='g')
ax.set_xlabel('Predicted labels')
ax.set_ylabel('True labels')
ax.set_title('Confusion Matrix')
plt.show()
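
The ConvergenceWarning above can be addressed by following its own advice: scale the features or raise max_iter (a minimal sketch). Note that the near-chance scores are expected here, since promotion was generated independently of the other columns:

[ ]: from sklearn.preprocessing import StandardScaler

# Standardized features help lbfgs converge within its iteration budget
scaler = StandardScaler()
x_train_s = scaler.fit_transform(x_train)
x_test_s = scaler.transform(x_test)

model = LogisticRegression(max_iter=1000)
model.fit(x_train_s, y_train)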

6 Support Vector Machine Classifier

[22]: from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.svm import SVC
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix, classification_report

[11]: iris=load_iris()
X=iris.data
y=iris.target

[12]: data = pd.DataFrame(iris["data"], columns=iris["feature_names"])

[13]: scaler = MinMaxScaler()


X_normalized = scaler.fit_transform(X)

[14]: X_train, X_test, y_train, y_test = train_test_split(X_normalized, y, test_size=0.2, random_state=42)

[15]: iris=load_iris()
X=iris.data
y=iris.target

[19]: svm = SVC(kernel='linear', gamma='scale', shrinking=False)  # gamma has no effect with a linear kernel
svm.fit(X_train, y_train)

[19]: SVC(kernel='linear', shrinking=False)

[20]: y_pred = svm.predict(X_test)

[23]: con = confusion_matrix(y_test, y_pred)


print(con)

[[10 0 0]
[ 0 9 0]
[ 0 0 11]]
[24]: clas = classification_report(y_test, y_pred)
print(clas)

precision recall f1-score support


0 1.00 1.00 1.00 10
1 1.00 1.00 1.00 9
2 1.00 1.00 1.00 11
accuracy 1.00 30
macro avg 1.00 1.00 1.00 30
weighted avg 1.00 1.00 1.00 30
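
accuracy_score and f1_score are imported above but never called; a minimal sketch reporting them directly:

[ ]: print('Accuracy:', accuracy_score(y_test, y_pred))
print('Macro F1:', f1_score(y_test, y_pred, average='macro'))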

7 Naïve Bayes Classifier

[3]: from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.naive_bayes import GaussianNB
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import confusion_matrix, classification_report  # used by the cells below

[22]: iris=load_iris()
X=iris.data
y=iris.target

[23]: data = pd.DataFrame(iris["data"], columns=iris["feature_names"])

[24]: scaler = MinMaxScaler()


X_normalized = scaler.fit_transform(X)

[30]: X_train, X_test, y_train, y_test = train_test_split(X_normalized, y, test_size=0.4, random_state=42)

[31]: gnb = GaussianNB()


gnb.fit(X_train, y_train)

[31]: GaussianNB()

[32]: y_pred = gnb.predict(X_test)

[33]: con = confusion_matrix(y_test, y_pred)


print(con)

[[23 0 0]
[ 0 18 1]
[ 0 1 17]]
[34]: clas = classification_report(y_test, y_pred)
print(clas)

precision recall f1-score support


0 1.00 1.00 1.00 23
1 0.95 0.95 0.95 19
2 0.94 0.94 0.94 18
accuracy 0.97 60
macro avg 0.96 0.96 0.96 60
weighted avg 0.97 0.97 0.97 60
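
The metrics module imported above also gives a one-line accuracy check (a minimal sketch):

[ ]: print('Accuracy:', metrics.accuracy_score(y_test, y_pred))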

8 K-Nearest Neighbour Classifier

[13]: from sklearn.neighbors import KNeighborsClassifier


from sklearn.model_selection import train_test_split
from sklearn.datasets import load_iris
import pandas as pd
from sklearn.preprocessing import MinMaxScaler

[14]: iris=load_iris()
X=iris.data
y=iris.target

[15]: data = pd.DataFrame(iris["data"], columns=iris["feature_names"])

[16]: scaler = MinMaxScaler()


X_normalized = scaler.fit_transform(X)

[58]: X_train, X_test, y_train, y_test = train_test_split(X_normalized, y, test_size=0.2, random_state=42)

[59]: from sklearn.datasets import load_iris
from sklearn.preprocessing import MinMaxScaler
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix, classification_report
from sklearn.model_selection import GridSearchCV

[60]: iris=load_iris()
X=iris.data
y=iris.target

[61]: data = pd.DataFrame(iris["data"], columns=iris["feature_names"])

[62]: scaler = MinMaxScaler()


X_normalized = scaler.fit_transform(X)

[63]: X_train, X_test, y_train, y_test = train_test_split(X_normalized, y, test_size=0.2, random_state=42)

[64]: knn = KNeighborsClassifier(n_neighbors=7)


knn.fit(X_train, y_train)

[64]: KNeighborsClassifier(n_neighbors=7)

[65]: y_pred = knn.predict(X_test)

[66]: con = confusion_matrix(y_test, y_pred)


print(con)

[[10 0 0]
[ 0 9 0]
[ 0 0 11]]
[67]: clas = classification_report(y_test, y_pred)
print(clas)

              precision    recall  f1-score   support

           0       1.00      1.00      1.00        10
           1       1.00      1.00      1.00         9
           2       1.00      1.00      1.00        11

    accuracy                           1.00        30
   macro avg       1.00      1.00      1.00        30
weighted avg       1.00      1.00      1.00        30
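
GridSearchCV is imported above but never used; a minimal sketch of how it could tune n_neighbors over a small grid:

[ ]: # 5-fold cross-validated search over odd neighbourhood sizes
param_grid = {'n_neighbors': [1, 3, 5, 7, 9, 11]}
grid = GridSearchCV(KNeighborsClassifier(), param_grid, cv=5)
grid.fit(X_train, y_train)
print(grid.best_params_, grid.best_score_)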

9 Decision Tree Classifier


[1]: from sklearn.datasets import load_iris
from sklearn.preprocessing import MinMaxScaler
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import f1_score,confusion_matrix, classification_report

[2]: iris=load_iris()
X=iris.data
y=iris.target

[3]: data = pd.DataFrame(iris["data"], columns=iris["feature_names"])

[4]: scaler = MinMaxScaler()


X_normalized = scaler.fit_transform(X)

[10]: X_train, X_test, y_train, y_test = train_test_split(X_normalized, y, test_size=0.4, random_state=42)

[11]: clf = DecisionTreeClassifier()


clf.fit(X_train, y_train)

[11]: DecisionTreeClassifier()

[12]: y_pred = clf.predict(X_test)

[13]: con = confusion_matrix(y_test, y_pred)


print(con)

[[23 0 0]
[ 0 19 0]
[ 0 0 18]]
[14]: clas = classification_report(y_test, y_pred)
print(clas)

precision recall f1-score support


0 1.00 1.00 1.00 23
1 1.00 1.00 1.00 19
2 1.00 1.00 1.00 18
accuracy 1.00 60
macro avg 1.00 1.00 1.00 60
weighted avg 1.00 1.00 1.00 60
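
f1_score is imported at the top of this section but never called; a minimal sketch of the macro-averaged score:

[ ]: print('Macro F1:', f1_score(y_test, y_pred, average='macro'))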

10 Clustering

[18]: import numpy as np


import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

[19]: df=pd.read_csv('Mall_Customers.csv')

[20]: df.columns

[20]: Index(['CustomerID', 'Genre', 'Age', 'Annual Income (k$)',


'Spending Score (1-100)'],
dtype='object')

[21]: df.head()

[21]: CustomerID Genre Age Annual Income (k$) Spending Score (1-100)
0 1 Male 19 15 39
1 2 Male 21 15 81
2 3 Female 20 16 6
3 4 Female 23 16 77
4 5 Female 31 17 40

[22]: df.drop(['CustomerID','Genre','Age'], axis = 1, inplace=True)

[23]: df.head()

[23]: Annual Income (k$) Spending Score (1-100)


0 15 39
1 15 81
2 16 6
3 16 77
4 17 40

[24]: from sklearn.cluster import KMeans


wcss_list= []
for i in range(1,11):
kmeans=KMeans(n_clusters=i, init='k-means++', random_state=42)
kmeans.fit(df)
wcss_list.append(kmeans.inertia_)
plt.plot(range(1,11), wcss_list)
plt.title('The elbow Method Graph')
plt.xlabel('Number of clusters(k)')
plt.ylabel('wcss_list')
plt.show()

[25]: kmeans=KMeans(n_clusters=5,init ='k-means++',random_state=42)


y_predict=kmeans.fit_predict(df)

[26]: # Re-fit the KMeans model with 5 clusters if needed


kmeans = KMeans(n_clusters=5, init='k-means++', random_state=42)
y_predict = kmeans.fit_predict(df)

# Plotting the clusters


plt.scatter(df.values[y_predict == 0, 0], df.values[y_predict == 0, 1], s=100, c='blue', label='Cluster 1')
plt.scatter(df.values[y_predict == 1, 0], df.values[y_predict == 1, 1], s=100, c='green', label='Cluster 2')
plt.scatter(df.values[y_predict == 2, 0], df.values[y_predict == 2, 1], s=100, c='red', label='Cluster 3')
plt.scatter(df.values[y_predict == 3, 0], df.values[y_predict == 3, 1], s=100, c='cyan', label='Cluster 4')
plt.scatter(df.values[y_predict == 4, 0], df.values[y_predict == 4, 1], s=100, c='magenta', label='Cluster 5')

# Plotting the centroids
plt.scatter(kmeans.cluster_centers_[:, 0], kmeans.cluster_centers_[:, 1], s=300, c='yellow', label='Centroid')

plt.title('Clusters of customers')
plt.xlabel('Annual Income (k$)')
plt.ylabel('Spending Score (1-100)')
plt.legend()
plt.show()
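
The elbow plot motivates k = 5; a complementary check is the mean silhouette coefficient (a minimal sketch using sklearn.metrics.silhouette_score, which is not part of the cells above):

[ ]: from sklearn.metrics import silhouette_score

# Values closer to 1 indicate tighter, better-separated clusters
print('Silhouette score:', silhouette_score(df, y_predict))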
