K-Means clustering on the Iris dataset
# Silence library warnings for everything that follows
import warnings
warnings.filterwarnings(action='ignore')

# Numeric and tabular libraries
import numpy as np
import pandas as pd

# Load the Iris measurements into a DataFrame and preview the first five rows
data = pd.read_csv("IRIS.csv")
data.head(n=5)
sepal_length sepal_width petal_length petal_width species
0 5.1 3.5 1.4 0.2 Iris-setosa
1 4.9 3.0 1.4 0.2 Iris-setosa
2 4.7 3.2 1.3 0.2 Iris-setosa
3 4.6 3.1 1.5 0.2 Iris-setosa
4 5.0 3.6 1.4 0.2 Iris-setosa
# Print the column names, non-null counts and dtypes of the loaded frame
data.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150 entries, 0 to 149
Data columns (total 5 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 sepal_length 150 non-null float64
1 sepal_width 150 non-null float64
2 petal_length 150 non-null float64
3 petal_width 150 non-null float64
4 species 150 non-null object
dtypes: float64(4), object(1)
memory usage: 6.0+ KB
# Summary statistics (count, mean, std, quartiles) of the numeric columns.
# describe has to be *called*: the bare attribute only echoes the bound
# method object instead of producing the statistics table.
data.describe()
<bound method NDFrame.describe of sepal_length sepal_width petal_length petal_width species
0 5.1 3.5 1.4 0.2 Iris-setosa
1 4.9 3.0 1.4 0.2 Iris-setosa
2 4.7 3.2 1.3 0.2 Iris-setosa
3 4.6 3.1 1.5 0.2 Iris-setosa
4 5.0 3.6 1.4 0.2 Iris-setosa
.. ... ... ... ... ...
145 6.7 3.0 5.2 2.3 Iris-virginica
146 6.3 2.5 5.0 1.9 Iris-virginica
147 6.5 3.0 5.2 2.0 Iris-virginica
148 6.2 3.4 5.4 2.3 Iris-virginica
149 5.9 3.0 5.1 1.8 Iris-virginica
[150 rows x 5 columns]>
Data Visualization
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
# Histogram of sepal length across all samples (10 equal-width bins)
plt.hist(data["sepal_length"], bins=10, color="green")
plt.xlabel("sepal_length")
# The y axis of a histogram is the number of samples per bin, not another feature
plt.ylabel("count")
plt.show()
# Pairwise plots of every numeric column, coloured by species
sns.pairplot(data=data, hue="species")
plt.show()
# Histogram of petal length with a KDE curve overlaid, split by species
petal_axes = sns.histplot(
    data,
    x="petal_length",
    bins=25,
    kde=True,
    hue="species",
)
petal_axes.set_xlabel("petal_length")
petal_axes.set_ylabel("count")
petal_axes.set_title("Petal Length distribution")
plt.show()
# Correlation heatmap of the four measurements; the species label column
# is dropped first so only the numeric columns are correlated
df1 = data.drop(columns=['species'])
feature_correlations = df1.corr()
sns.heatmap(feature_correlations, annot=True)
plt.show()
Mapping the species names to integers from 0 to 2
# Encode each species name as an integer code, numbered in the order listed
species_names = ('Iris-setosa', 'Iris-versicolor', 'Iris-virginica')
flower_mapping = {name: code for code, name in enumerate(species_names)}
data["species"] = data["species"].map(flower_mapping)
KMeans
from sklearn.cluster import KMeans

# Cluster on the two petal measurements only.
# random_state pins the centroid initialisation so that cluster indices and
# the order of cluster_centers_ are the same on every run; without it the
# labels can be permuted between runs. n_init is set explicitly because its
# default differs between scikit-learn releases.
kmeans = KMeans(n_clusters=3, n_init=10, random_state=42)
kmeans.fit(data[['petal_length', 'petal_width']])
▾ KMeans
KMeans(n_clusters=3)
# Fitted centroid coordinates: one row per cluster, columns in the order the
# model was fitted on (petal_length, petal_width)
kmeans.cluster_centers_
array([[5.59583333, 2.0375 ],
[1.464 , 0.244 ],
[4.26923077, 1.34230769]])
# Samples in petal space, coloured by their true species code
plt.scatter(data['petal_length'], data['petal_width'], c=data['species'], cmap='rainbow')

# Overlay the fitted centroids as large squares. They are read from the model
# rather than typed in by hand, so the plot stays correct if the data or the
# fit changes. Sorting by petal_length makes the colour assignment independent
# of the arbitrary cluster index order: smallest -> blue, middle -> green,
# largest -> red.
centers = kmeans.cluster_centers_
centers = centers[np.argsort(centers[:, 0])]
plt.scatter(centers[:, 0], centers[:, 1], s=200, c=['b', 'g', 'r'], marker='s')

plt.xlabel("petal_length")
plt.ylabel("petal_width")
plt.show()
# Assign every sample to the index of its nearest fitted centroid.
# predict only uses the already-fitted centroids; it does not refit the model.
petal_features = data[['petal_length', 'petal_width']]
pred = kmeans.predict(petal_features)
pred
array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 0, 2, 2, 2, 2, 2, 0, 2, 2, 2, 2,
2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])
# Classify one hand-made sample: petal_length=6, petal_width=2
sample_test = np.array([6, 2])
# predict expects a 2-D (n_samples, n_features) input, hence the single-row reshape
second_test = sample_test.reshape(1, -1)
# The model was fitted on a DataFrame with named columns; passing the same
# column names avoids scikit-learn's feature-name mismatch warning, which is
# otherwise only hidden by the global warnings filter.
sample_frame = pd.DataFrame(second_test, columns=['petal_length', 'petal_width'])
kmeans.predict(sample_frame)
array([0])
Loading [MathJax]/jax/output/CommonHTML/fonts/TeX/fontdata.js