Lampiran 1 Pseudocode COATES Algorithm Dengan Menggunakan Software Python
30
def getIndeks(X, Similar, TFIDF):
    """Assign each document to a cluster index.

    For each of the first ``len(TFIDF)`` rows of ``Similar`` (one row of
    centroid scores per document), return the position of the minimum
    value when ``X == 0`` (distance measure: closest centroid) or of the
    maximum value otherwise (similarity measure: most similar centroid).
    """
    extreme = min if X == 0 else max
    indices = []
    for d in range(len(TFIDF)):
        row = Similar[d]
        indices.append(row.index(extreme(row)))
    return indices
def readDocument(a, b):
    """Read plain-text documents "<a+1>.txt" .. "<b>.txt" from the current
    working directory and return their contents as a list of strings.

    Fixes over the original: the file handle is now closed via ``with``
    (the original called open().read() and leaked the handle), and the
    filename is built directly instead of concatenating through a loop
    that shadowed the outer loop variable ``i``.
    """
    doc = []
    for n in range(a, b):
        filename = str(n + 1) + ".txt"
        with open(filename, "r") as fh:  # guarantees the handle is closed
            doc.append(fh.read())
    return doc
def PreProcessing(corpus):
    """Pre-process every document in *corpus*: tokenize, drop English stop
    words and punctuation, and stem each surviving token.

    Returns a list with one token list per input document.

    Fixes over the original: the unused ``count`` counter is removed, and
    the stop-word collection is a set so the per-token membership test is
    O(1) instead of scanning a list.

    NOTE(review): relies on NLTK's ``PorterStemmer``, ``stopwords`` and
    ``word_tokenize`` being imported at module level — confirm those
    imports exist in the full source (they are outside this excerpt).
    """
    stemmer = PorterStemmer()
    stop_words = set(stopwords.words('english') + list(string.punctuation))
    doc = []
    for text in corpus:
        tokens = word_tokenize(text)  # tokenizing
        # Remove stop words first (test is on the raw token, as in the
        # original), then stem; PorterStemmer also lower-cases, which is
        # the "case folding" the original comment referred to.
        doc.append([stemmer.stem(tok) for tok in tokens
                    if tok not in stop_words])
    return doc
def getDictionary(doc):
    """Return the vocabulary of *doc*: every distinct token across all
    token lists, in first-seen order.

    Fix over the original: duplicate detection used ``j in dictionary``,
    an O(n) list scan per token (O(n^2) overall); a companion ``seen``
    set makes each check O(1) while preserving the output order exactly.
    """
    seen = set()
    dictionary = []
    for tokens in doc:
        for tok in tokens:
            if tok not in seen:  # de-duplicate
                seen.add(tok)
                dictionary.append(tok)
    return dictionary
def getNewCentroid(indeks, TFIDF, K):
    """Recompute one centroid per non-empty cluster as the element-wise
    mean of the TF-IDF rows assigned to it.

    ``indeks[d]`` is the cluster of document ``d``; ``K`` is the number
    of clusters. Prints a progress line per cluster. Empty clusters
    produce no centroid, so the result may have fewer than K rows
    (original behavior, preserved).
    """
    width = len(TFIDF[0])
    members = [indeks.count(c) for c in range(K)]
    remaining = list(indeks)            # working copy; entries consumed below
    centroid = []
    acc = [0] * width
    for c in range(K):
        for _ in range(members[c]):
            pos = remaining.index(c)    # next not-yet-consumed member of c
            acc = [a + b for a, b in zip(acc, TFIDF[pos])]
            remaining[pos] = -1         # mark row as consumed
        print(' Finding Centroid', c + 1, end='')
        if members[c] != 0:
            centroid.append([v / members[c] for v in acc])
            # acc is only reset for non-empty clusters, as in the original
            acc = [0] * width
        print('...Done!')
    return centroid
def printCluster(indeks, K):
    """Print the members of each of the K clusters, one line per cluster.

    ``indeks[d]`` is the (0-based) cluster of document ``d``; documents
    are reported 1-based as "Doc <n>".
    """
    print('')
    for cluster in range(K):
        print('CLUSTER', cluster + 1, ':', end='')
        for doc_no, assigned in enumerate(indeks, start=1):
            if assigned == cluster:
                print(' Doc', doc_no, end='')
        print('')
    print('')
def printme(A, B):
    """Preview a vector or matrix on stdout.

    B == 1: ``A`` is a vector; print its first three entries, an
            ellipsis, and its last entry.
    B > 1 : ``A`` is a matrix with B rows; print its first row, three
            dot lines, and its last row in the same abbreviated form.
    """
    if B == 1:
        print(' [', A[0], A[1], A[2], '...', A[len(A) - 1], ']')
        print('')
    else:
        print(' [', A[0][0], A[0][1], A[0][2], '...', A[0][len(A[0]) - 1], ']')
        for i in range(0, 3):
            print(' .')
        # BUG FIX: the original printed A[0][B-1], A[1][B-1], A[2][B-1]
        # here, i.e. column B-1 of the first three rows. The intent —
        # consistent with the vector branch, the final element already
        # being A[B-1][...], and the sample output (first row then last
        # row) — is the LAST ROW of the matrix.
        print(' [', A[B - 1][0], A[B - 1][1], A[B - 1][2], '...',
              A[B - 1][len(A[0]) - 1], ']\n')
# --- Driver: term weighting stage -----------------------------------------
# Documents 1..50 are the content corpus and 51..100 the auxiliary
# (metadata) corpus — presumably one file per document; confirm against
# the data-set layout.
rawcont = readDocument(0,50)
rawmeta = readDocument(50,100)
print('==============')
print('Term Weightned')
print('==============')
print('1.Pre-Processing',end='')
# Tokenize, remove stop words/punctuation and stem both corpora.
content = PreProcessing(rawcont)
auxiliary = PreProcessing(rawmeta)
print('...Done')
print('2.Finding Dictionary')
# Build the distinct vocabulary of each corpus; printme(B=1) previews a vector.
dictcont = getDictionary(content)
print(' Content Dictionary = Vector 1 x',len(dictcont))
printme(A=dictcont,B=1)
dictaux = getDictionary(auxiliary)
print(' Auxiliary Dictionary = Vector 1 x',len(dictaux))
printme(A=dictaux,B=1)
33
# NOTE(review): the appendix pages are truncated here — the lines that
# build TF and IDF and the loop header enclosing this document-frequency
# count were lost in extraction, so this fragment begins mid-loop.
# Apparently: count the documents in which term i occurs (DF).
if TF[j][i]>0:
count=count+1
Freq.append(count)
count=0
print('...Done')
print(' DF = Vector 1 x',len(Freq))
printme(A=Freq,B=1)
print('6.Finding Weight(TF*IDF)',end='')
# Weight matrix: TFIDF[i][j] = TF[i][j] * IDF[j], one row per document.
TFIDF=[];temp=[]
for i in range(0,len(TF)):
for j in range(0,len(TF[i])):
temp.append(TF[i][j]*IDF[j])
TFIDF.append(temp)
temp=[]
print('...Done')
print(' TFIDF = Matrix',len(TFIDF),'x',len(TFIDF[0]))
printme(A=TFIDF,B=len(TFIDF))
print('\n=====================================')
print('Content Based Algorithm Using K-Means')
print('=====================================')
# K-Means setup: iteration counter and number of clusters (K fixed at 4).
Iterasi=1
print('ITERATION ',Iterasi)
K=4
print('K =',K)
34
# Initialize centroids from K randomly chosen TF-IDF rows.
# NOTE(review): 'temp' here presumably holds K random 1-based document
# numbers drawn on a lost page — confirm against the full source.
for i in range(0,K):
centroid.append(TFIDF[temp[i]-1])
print('...Done')
# Main K-Means convergence loop: repeat until cluster membership stops
# changing between two consecutive assignments (indeksKmeans1 vs 2).
# NOTE(review): the statements recomputing indeksKmeans1/indeksKmeans2
# inside the loop were lost at a page break.
ulang=True
Iterasi=Iterasi+1
while(ulang==True):
print('\nITERATION ',Iterasi)
if indeksKmeans1 == indeksKmeans2:
# Membership unchanged -> converged; report clusters and stop.
print('\nAnggota Cluster Tidak Berubah')
print('Proses Berhenti')
printCluster(indeksKmeans2,K)
ulang=False
else:
# Membership changed -> adopt the new centroids and iterate again.
print('Anggota Cluster Berubah')
print('Repeat the Process from Step 2')
print('Centroid = New Centroid')
print('')
centroid = centroid1
Iterasi = Iterasi+1
dictclust=[]
35
# Build one de-duplicated token dictionary per cluster from the content
# tokens of its member documents (indeksKmeans2 is the final assignment).
for i in range(0,K):
temp=[]
for j in range(0,len(indeksKmeans2)):
if indeksKmeans2[j]==i:
for k in content[j]:
if k not in temp:
temp.append(k)
dictclust.append(temp)
print('=====================')
print('First Minor Iteration')
print('=====================')
print('1.Finding Cosine Similarity',end='')
# CosineSimilarity is defined on a page outside this excerpt.
Cosine = CosineSimilarity(centroid1,TFIDF)
print('...Done!')
print('======================')
print('Second Minor Iteration')
print('======================')
36
# Second minor iteration of COATES: per-term cluster distribution.
# Prj[i][j] = Frj[i][j] / Frm[i] — fraction of term i's occurrences that
# fall in cluster j (Frj/Frm are computed on a lost page; verify).
temp=[]
Prj=[];temp=[]
for i in range(0,len(Frj)):
for j in range(0,len(Frj[0])):
temp.append(Frj[i][j]/Frm[i])
Prj.append(temp)
temp=[]
# Gini index per term: sum over clusters of Prj^2 (higher = more
# concentrated in one cluster, i.e. more discriminatory).
Giny=[];temp=[];temp2=[]
for i in range(0,len(Prj)):
for j in range(0,K):
temp.append(math.pow(Prj[i][j],2))
Giny.append(sum(temp))
temp2.append(temp)
temp=[]
print('...Done!')
37
# NOTE(review): the first three lines duplicate the tail of the Giny loop
# on the previous page — likely a page-boundary artifact of the PDF
# extraction; kept as printed.
temp2.append(temp)
temp = []
print('...Done!')
# Reassign documents whose strongest discriminatory attribute points to a
# different cluster. FIX: the extraction had split the trailing comment
# (lines "…other" / "cluster") and the print string ("'to" / "Cluster'")
# across physical lines, making this fragment unparseable; rejoined here.
indeksSecondIteration = idx
for j in range(0, len(temp2)):
    # identify if a discriminatory attribute is assigned to another cluster
    x = max(temp2[j])
    idxx = temp2[j].index(x)
    indeksSecondIteration[Ri[j]] = idxx
print('...Done!')
# Report every document that moved between the two minor iterations.
for i in range(0, len(content)):
    if indeksFirstIteration[i] != indeksSecondIteration[i]:
        print('Doc', i + 1, 'Move from Cluster', indeksFirstIteration[i] + 1,
              'to Cluster', indeksSecondIteration[i] + 1)
38
Lampiran 2
Hasil Pemrograman :
6.Finding Weight(TF*IDF)...Done
TFIDF = Matrix 50 x 14161
[ 0.0 0.0 0.0 ... 0.0 ]
.
.
.
[ 1.6989700043360187 0.0 0.0 ... 1.6989700043360187 ]
Input any value to continue...
39
=====================================
Content Based Algorithm Using K-Means
=====================================
ITERATION 1
K=4
1.Initialize Random Centroid...Done
2.Compute Euclidian Distance...Done
3.Assigned Document to Closest Centroid...Done
ITERATION 2
1.Finding New Centroid
Finding Centroid 1...Done!
Finding Centroid 2...Done!
Finding Centroid 3...Done!
Finding Centroid 4...Done!
2.Compute Euclidian Distance Document to New Centroid...Done
3.Assigned Document to New Closest Centroid Done!
=====================
First Minor Iteration
=====================
1.Finding Cosine Similarity...Done!
2.Assign Document to Closest Cluster...Done!
3.Update Cluster Centroid First Minor Iteration
Finding Centroid 1...Done!
Finding Centroid 2...Done!
Finding Centroid 3...Done!
Finding Centroid 4...Done!
40
CLUSTER 1 : Doc 23 Doc 32 Doc 50
CLUSTER 2 : Doc 15
CLUSTER 3 : Doc 26 Doc 33
CLUSTER 4 : Doc 1 Doc 2 Doc 3 Doc 4 Doc 5 Doc 6 Doc 7 Doc 8 Doc 9 Doc
10 Doc 11 Doc 12 Doc 13 Doc 14 Doc 16 Doc 17 Doc 18 Doc 19 Doc 20 Doc
21 Doc 22 Doc 24 Doc 25 Doc 27 Doc 28 Doc 29 Doc 30 Doc 31 Doc 34 Doc
35 Doc 36 Doc 37 Doc 38 Doc 39 Doc 40 Doc 41 Doc 42 Doc 43 Doc 44 Doc
45 Doc 46 Doc 47 Doc 48 Doc 49
41