K Means

Download as docx, pdf, or txt
Download as docx, pdf, or txt
You are on page 1 of 15

K-MEANS

EJECUCIÓN DE CÓDIGO

Python 3.7.6 (default, Jan 8 2020, 20:23:39) [MSC v.1916 64 bit (AMD64)] :: Anaconda, Inc. on win32

Type "help", "copyright", "credits" or "license" for more information.

>>> import pandas as pd

>>> import numpy as np

>>> import matplotlib.pyplot as plt

>>> import seaborn as sb

>>> from sklearn.cluster import KMeans

>>> from sklearn.metrics import pairwise_distan

Traceback (most recent call last):

File "<stdin>", line 1, in <module>

ImportError: cannot import name 'pairwise_distan' from 'sklearn.metrics' (C:\ProgramData\Anaconda3\lib\site-packages\sklearn\metrics\__init__.py)

>>> from mpl_toolkits.mplot3d import Axes3D

>>> #realiza el dibujo del plano

>>> plt.rcParams['figure.figsize'] = (16, 9)


>>> plt.style.use('ggplot')

>>> dataframe = pd.read_csv("c:/datos/analisis.csv")

>>> dataframe.head()

           usuario         op         co         ex         ag         ne  wordcount  categoria
0     3gerardpique  34.297953  28.148819  41.948819  29.370315   9.841575    37.0945          7
1  aguerosergiokun  44.986842  20.525865  37.938947  24.279098  10.362406    78.7970          7
2   albertochicote  41.733854  13.745417  38.999896  34.645521   8.836979    49.2604          4
3    AlejandroSanz  40.377154  15.377462  52.337538  31.082154   5.032231    80.4538          2
4   alfredocasero1  36.664677  19.642258  48.530806  31.138871   7.305968    47.0645          4

>>> dataframe.describe()

               op          co          ex          ag          ne   wordcount   categoria
count  140.000000  140.000000  140.000000  140.000000  140.000000  140.000000  140.000000
mean    44.414591   22.977135   40.764428   22.918528    8.000098   98.715484    4.050000
std      8.425723    5.816851    7.185246    7.657122    3.039248   44.714071    2.658839
min     30.020465    7.852756   18.693542    9.305985    1.030213    5.020800    1.000000
25%     38.206484   19.740299   36.095722   17.050993    6.086144   66.218475    2.000000
50%     44.507091   22.466718   41.457492   21.384554    7.839722   94.711400    3.500000
75%     49.365923   26.091606   45.197769   28.678867    9.758189  119.707925    7.000000
max     71.696129   49.637863   59.824844   40.583162   23.978462  217.183200    9.000000

>>> print(dataframe.groupby('categoria').size())

categoria

1 27
2 34

3 9

4 19

5 4

6 8

7 17

8 16

9 6

dtype: int64

>>> from mpl_toolkits.mplot3d import Axes3D

>>> plt.rcParams['figure.figsize'] = (16, 9)

>>> plt.style.use('ggplot')

>>> dataframe = pd.read_csv("c:/datos/analisis.csv")

>>> dataframe.head()

           usuario         op         co         ex         ag         ne  wordcount  categoria
0     3gerardpique  34.297953  28.148819  41.948819  29.370315   9.841575    37.0945          7
1  aguerosergiokun  44.986842  20.525865  37.938947  24.279098  10.362406    78.7970          7
2   albertochicote  41.733854  13.745417  38.999896  34.645521   8.836979    49.2604          4
3    AlejandroSanz  40.377154  15.377462  52.337538  31.082154   5.032231    80.4538          2
4   alfredocasero1  36.664677  19.642258  48.530806  31.138871   7.305968    47.0645          4

>>> dataframe.describe()

               op          co          ex          ag          ne   wordcount   categoria
count  140.000000  140.000000  140.000000  140.000000  140.000000  140.000000  140.000000
mean    44.414591   22.977135   40.764428   22.918528    8.000098   98.715484    4.050000
std      8.425723    5.816851    7.185246    7.657122    3.039248   44.714071    2.658839
min     30.020465    7.852756   18.693542    9.305985    1.030213    5.020800    1.000000
25%     38.206484   19.740299   36.095722   17.050993    6.086144   66.218475    2.000000
50%     44.507091   22.466718   41.457492   21.384554    7.839722   94.711400    3.500000
75%     49.365923   26.091606   45.197769   28.678867    9.758189  119.707925    7.000000
max     71.696129   49.637863   59.824844   40.583162   23.978462  217.183200    9.000000

>>> print(dataframe.groupby('categoria').size())

categoria

1 27

2 34

3 9

4 19

5 4

6 8

7 17

8 16

9 6

dtype: int64

>>> dataframe.drop(['categoria'],1).hist()

array([[<matplotlib.axes._subplots.AxesSubplot object at 0x000001D7675F6288>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x000001D7695E40C8>],
       [<matplotlib.axes._subplots.AxesSubplot object at 0x000001D769619FC8>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x000001D769658148>],
       [<matplotlib.axes._subplots.AxesSubplot object at 0x000001D769690248>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x000001D7696C7348>]],
      dtype=object)

>>> plt.show()

>>> sb.pairplot(dataframe.dropna(),hue='categoria',size=4,vars=["op","ex","ag"],kind='scatter')

C:\ProgramData\Anaconda3\lib\site-packages\seaborn\axisgrid.py:2079: UserWarning: The `size` parameter has been renamed to `height`; please update your code.
  warnings.warn(msg, UserWarning)

<seaborn.axisgrid.PairGrid object at 0x000001D76AD9A848>

>>> plt.show()

>>> X = np.array(dataframe[["op","ex","ag"]])

>>> y = np.array(dataframe['categoria'])

>>> X.shape

(140, 3)

>>> fig = plt.figure()

>>> ax = Axes3D(fig)

>>> colores=['blue','red','green','blue','cyan','yellow','orange','black','pink','brown','purple']

>>> asignar=[]

>>> for row in y:asignar.append(colores[row])

... ax.scatter(X[:, 0], X[:, 1], X[:, 2], c=asignar,s=60)

File "<stdin>", line 2

ax.scatter(X[:, 0], X[:, 1], X[:, 2], c=asignar,s=60)

SyntaxError: invalid syntax

>>> fig.show()

>>> Nc = range(1, 20)

>>> kmeans = [KMeans(n_clusters=i) for i in Nc]

>>> kmeans
[KMeans(algorithm='auto', copy_x=True, init='k-means++', max_iter=300,

n_clusters=1, n_init=10, n_jobs=None, precompute_distances='auto',

random_state=None, tol=0.0001, verbose=0), KMeans(algorithm='auto',


copy_x=True, init='k-means++', max_iter=300,

n_clusters=2, n_init=10, n_jobs=None, precompute_distances='auto',

random_state=None, tol=0.0001, verbose=0), KMeans(algorithm='auto',


copy_x=True, init='k-means++', max_iter=300,

n_clusters=3, n_init=10, n_jobs=None, precompute_distances='auto',

random_state=None, tol=0.0001, verbose=0), KMeans(algorithm='auto',


copy_x=True, init='k-means++', max_iter=300,

n_clusters=4, n_init=10, n_jobs=None, precompute_distances='auto',

random_state=None, tol=0.0001, verbose=0), KMeans(algorithm='auto',


copy_x=True, init='k-means++', max_iter=300,

n_clusters=5, n_init=10, n_jobs=None, precompute_distances='auto',

random_state=None, tol=0.0001, verbose=0), KMeans(algorithm='auto',


copy_x=True, init='k-means++', max_iter=300,

n_clusters=6, n_init=10, n_jobs=None, precompute_distances='auto',

random_state=None, tol=0.0001, verbose=0), KMeans(algorithm='auto',


copy_x=True, init='k-means++', max_iter=300,

n_clusters=7, n_init=10, n_jobs=None, precompute_distances='auto',

random_state=None, tol=0.0001, verbose=0), KMeans(algorithm='auto',


copy_x=True, init='k-means++', max_iter=300,

n_clusters=8, n_init=10, n_jobs=None, precompute_distances='auto',

random_state=None, tol=0.0001, verbose=0), KMeans(algorithm='auto',


copy_x=True, init='k-means++', max_iter=300,

n_clusters=9, n_init=10, n_jobs=None, precompute_distances='auto',

random_state=None, tol=0.0001, verbose=0), KMeans(algorithm='auto',


copy_x=True, init='k-means++', max_iter=300,

n_clusters=10, n_init=10, n_jobs=None, precompute_distances='auto',

random_state=None, tol=0.0001, verbose=0), KMeans(algorithm='auto',


copy_x=True, init='k-means++', max_iter=300,

n_clusters=11, n_init=10, n_jobs=None, precompute_distances='auto',

random_state=None, tol=0.0001, verbose=0), KMeans(algorithm='auto',


copy_x=True, init='k-means++', max_iter=300,
n_clusters=12, n_init=10, n_jobs=None, precompute_distances='auto',

random_state=None, tol=0.0001, verbose=0), KMeans(algorithm='auto',


copy_x=True, init='k-means++', max_iter=300,

n_clusters=13, n_init=10, n_jobs=None, precompute_distances='auto',

random_state=None, tol=0.0001, verbose=0), KMeans(algorithm='auto',


copy_x=True, init='k-means++', max_iter=300,

n_clusters=14, n_init=10, n_jobs=None, precompute_distances='auto',

random_state=None, tol=0.0001, verbose=0), KMeans(algorithm='auto',


copy_x=True, init='k-means++', max_iter=300,

n_clusters=15, n_init=10, n_jobs=None, precompute_distances='auto',

random_state=None, tol=0.0001, verbose=0), KMeans(algorithm='auto',


copy_x=True, init='k-means++', max_iter=300,

n_clusters=16, n_init=10, n_jobs=None, precompute_distances='auto',

random_state=None, tol=0.0001, verbose=0), KMeans(algorithm='auto',


copy_x=True, init='k-means++', max_iter=300,

n_clusters=17, n_init=10, n_jobs=None, precompute_distances='auto',

random_state=None, tol=0.0001, verbose=0), KMeans(algorithm='auto',


copy_x=True, init='k-means++', max_iter=300,

n_clusters=18, n_init=10, n_jobs=None, precompute_distances='auto',

random_state=None, tol=0.0001, verbose=0), KMeans(algorithm='auto',


copy_x=True, init='k-means++', max_iter=300,

n_clusters=19, n_init=10, n_jobs=None, precompute_distances='auto',

random_state=None, tol=0.0001, verbose=0)]

>>> score = [kmeans[i].fit(X).score(X) for i in range(len(kmeans))]

>>> score

[-25194.039352766275, -12632.031536945231, -9971.470486889943,


-8343.956647749546, -6960.8397772049575, -6094.510898323235,
-5479.9524609810205, -4984.410638822017, -4462.561746198897,
-3966.3563403226008, -3626.8497478032577, -3430.2619254837846,
-3191.634629726945, -2984.573579366569, -2858.1679660936406,
-2662.582439309178, -2476.1049636437565, -2376.5702202311627,
-2283.6007323083313]

>>> plt.plot(Nc,score)

[<mpl_toolkits.mplot3d.art3d.Line3D object at 0x000001D7696B36C8>]

>>> plt.xlabel('Number of Clusters')


Text(0.5, 0, 'Number of Clusters')

>>> plt.ylabel('Score')

Text(0.5, 0, 'Score')

>>> plt.title('Elbow Curve')

Text(0.5, 0.92, 'Elbow Curve')

>>> plt.show()

>>> kmeans = KMeans(n_clusters=5).fit(X)

>>> centroids = kmeans.cluster_centers_

>>> print(centroids)

[[35.90241306 47.56828232 33.58748762]

[42.968253 32.53013537 20.93305995]

[39.98518668 43.35386426 23.32930742]

[50.26472539 40.722464 17.31345388]

[58.70462307 30.53566167 15.72207033]]

>>> labels = kmeans.predict(X)

>>> C = kmeans.cluster_centers_

>>> colores=['red','green','blue','cyan','yellow']

>>> asignar=[]

>>> for row in labels:asignar.append(colores[row])

... fig = plt.figure()

File "<stdin>", line 2

fig = plt.figure()

SyntaxError: invalid syntax

>>> ax = Axes3D(fig)

>>> ax.scatter(X[:, 0], X[:, 1], X[:, 2], c=asignar,s=60)

<mpl_toolkits.mplot3d.art3d.Path3DCollection object at 0x000001D768228748>

>>> ax.scatter(C[:, 0], C[:, 1], C[:, 2], marker='*', c=colores, s=1000)

<mpl_toolkits.mplot3d.art3d.Path3DCollection object at 0x000001D76835EE08>
>>> fig.show()

>>> f1 = dataframe['op'].values

>>> f2 = dataframe['ex'].values

>>> plt.scatter(f1, f2, c=asignar, s=70)

<matplotlib.collections.PathCollection object at 0x000001D769785B48>

>>> plt.scatter(C[:, 0], C[:, 1], marker='*', c=colores, s=1000)

<matplotlib.collections.PathCollection object at 0x000001D76972A9C8>

>>> plt.show()

>>> copy = pd.DataFrame()

>>> copy['usuario']=dataframe['usuario'].values

>>> copy['categoria']=dataframe['categoria'].values

>>> copy['label'] = labels;

>>> cantidadGrupo = pd.DataFrame()

>>> cantidadGrupo['color']=colores

>>> cantidadGrupo['cantidad']=copy.groupby('label').size()

>>> cantidadGrupo

color cantidad

0 red 34

1 green 19

2 blue 31

3 cyan 41

4 yellow 15

>>> copy = pd.DataFrame()

>>> copy['usuario']=dataframe['usuario'].values

>>> copy['categoria']=dataframe['categoria'].values

>>> copy['label'] = labels;

>>> cantidadGrupo = pd.DataFrame()

>>> cantidadGrupo['color']=colores

>>> cantidadGrupo['cantidad']=copy.groupby('label').size()

>>> cantidadGrupo
color cantidad

0 red 34

1 green 19

2 blue 31

3 cyan 41

4 yellow 15

>>> group_referrer_index = copy['label'] ==0

>>> group_referrals = copy[group_referrer_index]

>>> diversidadGrupo = pd.DataFrame()

>>> diversidadGrupo['categoria']=[0,1,2,3,4,5,6,7,8,9]

>>> diversidadGrupo['cantidad']=group_referrals.groupby('categoria').size()

>>> diversidadGrupo

categoria cantidad

0 0 NaN

1 1 3.0

2 2 4.0

3 3 NaN

4 4 12.0

5 5 3.0

6 6 1.0

7 7 5.0

8 8 3.0

9 9 3.0

>>> closest, _= pairwise_distances_argmin_min(kmeans.cluster_centers_, x)

Traceback (most recent call last):

File "<stdin>", line 1, in <module>

NameError: name 'pairwise_distances_argmin_min' is not defined

>>> Closest

Traceback (most recent call last):

File "<stdin>", line 1, in <module>


NameError: name 'Closest' is not defined

>>> users=dataframe['usuario'].values

>>> for row in closest:print(users[row])

You might also like