K Means

K-MEANS
EJECUCIÓN DE CÓDIGO
Python 3.7.6 (default, Jan 8 2020, 20:23:39) [MSC v.1916 64 bit (AMD64)] :: Anaconda, Inc. on win32
Type "help", "copyright", "credits" or "license" for more information.
>>> import pandas as pd
>>> import numpy as np
>>> import matplotlib.pyplot as plt
>>> import seaborn as sb
>>> from sklearn.cluster import KMeans
>>> from sklearn.metrics import pairwise_distan
Traceback (most recent call last):
File "<stdin>", line 1, in <module>
ImportError: cannot import name 'pairwise_distan' from 'sklearn.metrics' (C:\ProgramData\Anaconda3\lib\site-packages\sklearn\metrics\__init__.py)
>>> from mpl_toolkits.mplot3d import Axes3D
>>> #realiza el dibujo del plano
>>> plt.rcParams['figure.figsize'] = (16, 9)

>>> plt.style.use('ggplot')
>>> dataframe = pd.read_csv("c:/datos/analisis.csv")
>>> dataframe.head()
           usuario         op         co         ex         ag         ne  wordcount  categoria
0     3gerardpique  34.297953  28.148819  41.948819  29.370315   9.841575    37.0945          7
1  aguerosergiokun  44.986842  20.525865  37.938947  24.279098  10.362406    78.7970          7
2   albertochicote  41.733854  13.745417  38.999896  34.645521   8.836979    49.2604          4
3    AlejandroSanz  40.377154  15.377462  52.337538  31.082154   5.032231    80.4538          2
4   alfredocasero1  36.664677  19.642258  48.530806  31.138871   7.305968    47.0645          4
>>> dataframe.describe()
               op          co          ex          ag          ne   wordcount   categoria
count  140.000000  140.000000  140.000000  140.000000  140.000000  140.000000  140.000000
mean    44.414591   22.977135   40.764428   22.918528    8.000098   98.715484    4.050000
std      8.425723    5.816851    7.185246    7.657122    3.039248   44.714071    2.658839
min     30.020465    7.852756   18.693542    9.305985    1.030213    5.020800    1.000000
25%     38.206484   19.740299   36.095722   17.050993    6.086144   66.218475    2.000000
50%     44.507091   22.466718   41.457492   21.384554    7.839722   94.711400    3.500000
75%     49.365923   26.091606   45.197769   28.678867    9.758189  119.707925    7.000000
max     71.696129   49.637863   59.824844   40.583162   23.978462  217.183200    9.000000
>>> print(dataframe.groupby('categoria').size())
categoria
1 27
2 34
3 9
4 19
5 4
6 8
7 17
8 16
9 6
dtype: int64
>>> from mpl_toolkits.mplot3d import Axes3D
>>> plt.rcParams['figure.figsize'] = (16, 9)
>>> plt.style.use('ggplot')
>>> dataframe = pd.read_csv("c:/datos/analisis.csv")
>>> dataframe.head()
           usuario         op         co         ex         ag         ne  wordcount  categoria
0     3gerardpique  34.297953  28.148819  41.948819  29.370315   9.841575    37.0945          7
1  aguerosergiokun  44.986842  20.525865  37.938947  24.279098  10.362406    78.7970          7
2   albertochicote  41.733854  13.745417  38.999896  34.645521   8.836979    49.2604          4
3    AlejandroSanz  40.377154  15.377462  52.337538  31.082154   5.032231    80.4538          2
4   alfredocasero1  36.664677  19.642258  48.530806  31.138871   7.305968    47.0645          4
>>> dataframe.describe()
               op          co          ex          ag          ne   wordcount   categoria
count  140.000000  140.000000  140.000000  140.000000  140.000000  140.000000  140.000000
mean    44.414591   22.977135   40.764428   22.918528    8.000098   98.715484    4.050000
std      8.425723    5.816851    7.185246    7.657122    3.039248   44.714071    2.658839
min     30.020465    7.852756   18.693542    9.305985    1.030213    5.020800    1.000000
25%     38.206484   19.740299   36.095722   17.050993    6.086144   66.218475    2.000000
50%     44.507091   22.466718   41.457492   21.384554    7.839722   94.711400    3.500000
75%     49.365923   26.091606   45.197769   28.678867    9.758189  119.707925    7.000000
max     71.696129   49.637863   59.824844   40.583162   23.978462  217.183200    9.000000
>>> print(dataframe.groupby('categoria').size())
categoria
1 27
2 34
3 9
4 19
5 4
6 8
7 17
8 16
9 6
dtype: int64
>>> dataframe.drop(['categoria'],1).hist()
array([[<matplotlib.axes._subplots.AxesSubplot object at 0x000001D7675F6288>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x000001D7695E40C8>],
       [<matplotlib.axes._subplots.AxesSubplot object at 0x000001D769619FC8>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x000001D769658148>],
       [<matplotlib.axes._subplots.AxesSubplot object at 0x000001D769690248>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x000001D7696C7348>]],
      dtype=object)
>>> plt.show()
>>> sb.pairplot(dataframe.dropna(),hue='categoria',size=4,vars=["op","ex","ag"],kind='scatter')
C:\ProgramData\Anaconda3\lib\site-packages\seaborn\axisgrid.py:2079: UserWarning: The `size` parameter has been renamed to `height`; please update your code.
  warnings.warn(msg, UserWarning)
<seaborn.axisgrid.PairGrid object at 0x000001D76AD9A848>
>>> plt.show()
>>> X = np.array(dataframe[["op","ex","ag"]])
>>> y = np.array(dataframe['categoria'])
>>> X.shape
(140, 3)
>>> fig = plt.figure()
>>> ax = Axes3D(fig)
>>> colores=['blue','red','green','blue','cyan','yellow','orange','black','pink','brown','purple']
>>> asignar=[]
>>> for row in y:asignar.append(colores[row])
... ax.scatter(X[:, 0], X[:, 1], X[:, 2], c=asignar,s=60)
File "<stdin>", line 2
ax.scatter(X[:, 0], X[:, 1], X[:, 2], c=asignar,s=60)
SyntaxError: invalid syntax
>>> fig.show()
>>> Nc = range(1, 20)
>>> kmeans = [KMeans(n_clusters=i) for i in Nc]
>>> kmeans
[KMeans(algorithm='auto', copy_x=True, init='k-means++', max_iter=300,
n_clusters=1, n_init=10, n_jobs=None, precompute_distances='auto',
random_state=None, tol=0.0001, verbose=0), KMeans(algorithm='auto',

copy_x=True, init='k-means++', max_iter=300,

















random_state=None, tol=0.0001, verbose=0)]
>>> score = [kmeans[i].fit(X).score(X) for i in range(len(kmeans))]
>>> score
[-25194.039352766275, -12632.031536945231, -9971.470486889943,

-8343.956647749546, -6960.8397772049575, -6094.510898323235,
-5479.9524609810205, -4984.410638822017, -4462.561746198897,
-3966.3563403226008, -3626.8497478032577, -3430.2619254837846,
-3191.634629726945, -2984.573579366569, -2858.1679660936406,
-2662.582439309178, -2476.1049636437565, -2376.5702202311627,
-2283.6007323083313]
>>> plt.plot(Nc,score)
[<mpl_toolkits.mplot3d.art3d.Line3D object at 0x000001D7696B36C8>]
>>> plt.xlabel('Number of Clusters')

Text(0.5, 0, 'Number of Clusters')
>>> plt.ylabel('Score')
Text(0.5, 0, 'Score')
>>> plt.title('Elbow Curve')
Text(0.5, 0.92, 'Elbow Curve')
>>> plt.show()
>>> kmeans = KMeans(n_clusters=5).fit(X)
>>> centroids = kmeans.cluster_centers_
>>> print(centroids)
[[35.90241306 47.56828232 33.58748762]
[42.968253 32.53013537 20.93305995]
[39.98518668 43.35386426 23.32930742]
[50.26472539 40.722464 17.31345388]
[58.70462307 30.53566167 15.72207033]]
>>> labels = kmeans.predict(X)
>>> C = kmeans.cluster_centers_
>>> colores=['red','green','blue','cyan','yellow']
>>> asignar=[]
>>> for row in labels:asignar.append(colores[row])
... fig = plt.figure()
File "<stdin>", line 2
fig = plt.figure()
SyntaxError: invalid syntax
>>> ax = Axes3D(fig)
>>> ax.scatter(X[:, 0], X[:, 1], X[:, 2], c=asignar,s=60)
<mpl_toolkits.mplot3d.art3d.Path3DCollection object at 0x000001D768228748>
>>> ax.scatter(C[:, 0], C[:, 1], C[:, 2], marker='*', c=colores, s=1000)
<mpl_toolkits.mplot3d.art3d.Path3DCollection object at 0x000001D76835EE08>
>>> fig.show()
>>> f1 = dataframe['op'].values
>>> f2 = dataframe['ex'].values
>>> plt.scatter(f1, f2, c=asignar, s=70)
<matplotlib.collections.PathCollection object at 0x000001D769785B48>
>>> plt.scatter(C[:, 0], C[:, 1], marker='*', c=colores, s=1000)
<matplotlib.collections.PathCollection object at 0x000001D76972A9C8>
>>> plt.show()
>>> copy = pd.DataFrame()
>>> copy['usuario']=dataframe['usuario'].values
>>> copy['categoria']=dataframe['categoria'].values
>>> copy['label'] = labels;
>>> cantidadGrupo = pd.DataFrame()
>>> cantidadGrupo['color']=colores
>>> cantidadGrupo['cantidad']=copy.groupby('label').size()
>>> cantidadGrupo
color cantidad
0 red 34
1 green 19
2 blue 31
3 cyan 41
4 yellow 15
>>> copy = pd.DataFrame()
>>> copy['usuario']=dataframe['usuario'].values
>>> copy['categoria']=dataframe['categoria'].values
>>> copy['label'] = labels;
>>> cantidadGrupo = pd.DataFrame()
>>> cantidadGrupo['color']=colores
>>> cantidadGrupo['cantidad']=copy.groupby('label').size()
>>> cantidadGrupo
color cantidad
0 red 34
1 green 19
2 blue 31
3 cyan 41
4 yellow 15
>>> group_referrer_index = copy['label'] ==0
>>> group_referrals = copy[group_referrer_index]
>>> diversidadGrupo = pd.DataFrame()
>>> diversidadGrupo['categoria']=[0,1,2,3,4,5,6,7,8,9]
>>> diversidadGrupo['cantidad']=group_referrals.groupby('categoria').size()
>>> diversidadGrupo
categoria cantidad
0 0 NaN
1 1 3.0
2 2 4.0
3 3 NaN
4 4 12.0
5 5 3.0
6 6 1.0
7 7 5.0
8 8 3.0
9 9 3.0
>>> closest, _= pairwise_distances_argmin_min(kmeans.cluster_centers_, x)
NameError: name 'pairwise_distances_argmin_min' is not defined
>>> Closest

NameError: name 'Closest' is not defined
>>> users=dataframe['usuario'].values
>>> for row in closest:print(users[row])

K Means

Uploaded by

Copyright:

Available Formats

K Means

Uploaded by

Document Information

Original Title

Copyright

Available Formats

Share this document

Share or Embed Document

Sharing Options

Did you find this document useful?

Is this content inappropriate?

Copyright:

Available Formats

K Means

Uploaded by

Copyright:

Available Formats

K-MEANS

Type "help", "copyright", "credits" or "license" for more information.

>>> import pandas as pd

>>> import numpy as np

>>> import matplotlib.pyplot as plt

>>> import seaborn as sb

>>> from sklearn.cluster import KMeans

>>> from sklearn.metrics import pairwise_distan

Traceback (most recent call last):

File "<stdin>", line 1, in <module>

ImportError: cannot import name 'pairwise_distan' from 'sklearn.metrics'

>>> from mpl_toolkits.mplot3d import Axes3D

>>> #realiza el dibujo del plano

>>> plt.rcParams['figure.figsize'] = (16, 9)

>>> dataframe = pd.read_csv("c:/datos/analisis.csv")

usuario op co ex ag ne wordcount categoria

0 3gerardpique 34.297953 28.148819 41.948819 29.370315 9.841575

1 aguerosergiokun 44.986842 20.525865 37.938947 24.279098

2 albertochicote 41.733854 13.745417 38.999896 34.645521 8.836979

3 AlejandroSanz 40.377154 15.377462 52.337538 31.082154 5.032231

4 alfredocasero1 36.664677 19.642258 48.530806 31.138871 7.305968

count 140.000000 140.000000 140.000000 140.000000 140.000000

mean 44.414591 22.977135 40.764428 22.918528 8.000098

std 8.425723 5.816851 7.185246 7.657122 3.039248 44.714071

min 30.020465 7.852756 18.693542 9.305985 1.030213 5.020800

25% 38.206484 19.740299 36.095722 17.050993 6.086144

50% 44.507091 22.466718 41.457492 21.384554 7.839722

75% 49.365923 26.091606 45.197769 28.678867 9.758189

max 71.696129 49.637863 59.824844 40.583162 23.978462

>>> from mpl_toolkits.mplot3d import Axes3D

>>> plt.rcParams['figure.figsize'] = (16, 9)

>>> dataframe = pd.read_csv("c:/datos/analisis.csv")

usuario op co ex ag ne wordcount categoria

0 3gerardpique 34.297953 28.148819 41.948819 29.370315 9.841575

1 aguerosergiokun 44.986842 20.525865 37.938947 24.279098

2 albertochicote 41.733854 13.745417 38.999896 34.645521 8.836979

3 AlejandroSanz 40.377154 15.377462 52.337538 31.082154 5.032231

4 alfredocasero1 36.664677 19.642258 48.530806 31.138871 7.305968

count 140.000000 140.000000 140.000000 140.000000 140.000000

mean 44.414591 22.977135 40.764428 22.918528 8.000098

std 8.425723 5.816851 7.185246 7.657122 3.039248 44.714071

25% 38.206484 19.740299 36.095722 17.050993 6.086144

50% 44.507091 22.466718 41.457492 21.384554 7.839722

75% 49.365923 26.091606 45.197769 28.678867 9.758189

max 71.696129 49.637863 59.824844 40.583162 23.978462

<seaborn.axisgrid.PairGrid object at 0x000001D76AD9A848>

>>> fig = plt.figure()

>>> for row in y:asignar.append(colores[row])

... ax.scatter(X[:, 0], X[:, 1], X[:, 2], c=asignar,s=60)

File "<stdin>", line 2

ax.scatter(X[:, 0], X[:, 1], X[:, 2], c=asignar,s=60)

SyntaxError: invalid syntax

>>> Nc = range(1, 20)

>>> kmeans = [KMeans(n_clusters=i) for i in Nc]

n_clusters=1, n_init=10, n_jobs=None, precompute_distances='auto',

random_state=None, tol=0.0001, verbose=0), KMeans(algorithm='auto',

n_clusters=2, n_init=10, n_jobs=None, precompute_distances='auto',

random_state=None, tol=0.0001, verbose=0), KMeans(algorithm='auto',

n_clusters=3, n_init=10, n_jobs=None, precompute_distances='auto',