Word2Vec
import numpy as np
import os

# Load the 20 Newsgroups training split; the loading cell is missing from this
# dump, so sklearn's standard loader is assumed here.
from sklearn.datasets import fetch_20newsgroups
newsgroups_train = fetch_20newsgroups(subset='train')

# Build word <-> index lookup tables over the whole corpus.
wordToInd = {}
indToWord = {}
count = 0
for ind, file in enumerate(newsgroups_train.data):
    file = file.replace('\n', ' ')
    file = file.replace('\r', ' ')
    for word in file.split():
        if word not in wordToInd:
            wordToInd[word] = count
            indToWord[count] = word
            count += 1
print(ind)             # index of the last document processed
print(len(wordToInd))  # vocabulary size
# Tokenize each document into a list of words.
docs = []
for file in newsgroups_train.data:
    docs.append(file.split())
print(len(docs))
print(docs[0])
# Build skip-gram training pairs: pair every word with each neighbor inside a
# window of +/- windowSize positions.
windowSize = 2
samples = []
for doc in range(len(docs)):
    for index in range(len(docs[doc])):
        for n in range(max(0, index - windowSize),
                       min(index + windowSize, len(docs[doc]) - 1) + 1):
            if n != index:
                samples.append([wordToInd[docs[doc][index]], wordToInd[docs[doc][n]]])
# Split the pairs into source (center) and target (context) index arrays;
# samplesTarget is an assumed name mirroring samplesSource, reshaped to [N, 1]
# because tf.nn.nce_loss expects 2-D labels.
print(samples[:3])
samples = np.array(samples)
samplesSource = samples[:, 0]
samplesTarget = samples[:, 1].reshape(-1, 1)
print(samplesSource.shape)
# Use the TensorFlow 1.x graph API through the v2 compatibility layer.
import tensorflow.compat.v1 as tf
tf.disable_v2_behavior()
print(tf.__version__)
batchSize = 50000
embeddingSize = 50
vocabSize = len(wordToInd)
numNegativeSamples = 10
validationSize = 8
validationX = np.random.choice(validationSize * 10, validationSize, replace=False)
print('valid: ',validationX)
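# A minimal sketch of the graph inputs and embedding matrix, assuming the names
# trainX, trainY, validationDataset, embeddings, and embed that the cells below
# rely on (the defining cell is missing from this dump).
trainX = tf.placeholder(tf.int32, shape=[batchSize])         # center-word ids
trainY = tf.placeholder(tf.int32, shape=[batchSize, 1])      # context-word ids
validationDataset = tf.constant(validationX, dtype=tf.int32) # fixed validation ids

embeddings = tf.Variable(tf.random_uniform([vocabSize, embeddingSize], -1.0, 1.0))
embed = tf.nn.embedding_lookup(embeddings, trainX)           # [batchSize, embeddingSize]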
# Output-side weights and biases for the NCE objective.
nceWeights = tf.Variable(tf.truncated_normal([vocabSize, embeddingSize],
                                             stddev=1.0 / tf.sqrt(embeddingSize * 1.0)))
nceBias = tf.Variable(tf.zeros(shape=[vocabSize]))
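# A minimal sketch of the training objective, assuming the standard TF1
# noise-contrastive estimation loss over nceWeights/nceBias defined above.
loss = tf.reduce_mean(
    tf.nn.nce_loss(weights=nceWeights, biases=nceBias,
                   labels=trainY, inputs=embed,
                   num_sampled=numNegativeSamples, num_classes=vocabSize))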
optOperation = tf.train.GradientDescentOptimizer(learning_rate=0.9).minimize(loss)
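# Cosine-similarity graph for the validation words, assuming the names
# normalizedEmbeddings and similarity referenced in the evaluation cells below.
norm = tf.sqrt(tf.reduce_sum(tf.square(embeddings), 1, keepdims=True))
normalizedEmbeddings = embeddings / norm  # unit-length rows
validationEmbeddings = tf.nn.embedding_lookup(normalizedEmbeddings, validationDataset)
similarity = tf.matmul(validationEmbeddings, normalizedEmbeddings, transpose_b=True)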
numEpochs = 2
learningRate = 0.9
numBatches = int(len(samples)/batchSize)
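# A minimal training-loop sketch, assuming samplesSource/samplesTarget feed the
# placeholders above; the last partial batch is dropped by the numBatches cutoff.
sess = tf.Session()
sess.run(tf.global_variables_initializer())
for epoch in range(numEpochs):
    for batch in range(numBatches):
        batchX = samplesSource[batch * batchSize:(batch + 1) * batchSize]
        batchY = samplesTarget[batch * batchSize:(batch + 1) * batchSize]
        _, lossValue = sess.run([optOperation, loss],
                                feed_dict={trainX: batchX, trainY: batchY})
    print('epoch {0:} loss {1:.3f}'.format(epoch, lossValue))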
# For each validation word, print its topK nearest neighbors by cosine similarity.
similarityScores = sess.run(similarity)
for i in range(validationSize):
    topK = 5
    similarWords = (-similarityScores[i, :]).argsort()[1:topK + 1]  # slot 0 is the word itself
    similarStrings = 'Similar to {0:}: '.format(indToWord[validationX[i]])
    for k in range(topK):
        similarStrings += indToWord[similarWords[k]] + ','
    print(similarStrings)
# Materialize the normalized embedding matrix as a NumPy array.
finalEmbeddings = sess.run(normalizedEmbeddings)
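# Example use of the trained matrix: score two words by cosine similarity.
# 'computer' and 'system' are hypothetical tokens; any two words present in
# wordToInd work, and the rows are already unit length so a dot product suffices.
if 'computer' in wordToInd and 'system' in wordToInd:
    v1 = finalEmbeddings[wordToInd['computer']]
    v2 = finalEmbeddings[wordToInd['system']]
    print('cosine similarity:', float(np.dot(v1, v2)))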