Word2Vec

The document outlines a TensorFlow implementation for generating word embeddings using the 20 Newsgroups dataset. It includes data preprocessing, sample generation for training, and the setup of a neural network with negative sampling for training word embeddings. The training process is executed over multiple epochs, and the document also demonstrates how to find similar words based on the learned embeddings.


import tensorflow as tf
import numpy as np
from tensorflow import keras
from tensorflow.keras import layers
from sklearn.datasets import fetch_20newsgroups


newsgroups_train = fetch_20newsgroups(subset='train')

import os

wordToInd = {}   # word -> integer id
indToWord = {}   # integer id -> word
count = 0
for ind, file in enumerate(newsgroups_train.data):
    file = file.replace('\n', ' ')
    file = file.replace('\r', ' ')
    for word in file.split():
        if word not in wordToInd:
            wordToInd[word] = count
            indToWord[count] = word
            count += 1
    print(ind)   # progress: index of the document just processed
print(len(wordToInd))

docs = []
for file in newsgroups_train.data:
    docs.append(file.split())
print(len(docs))
print(docs[0])

Generation of samples (pairs of source and target)

windowSize = 2
samples = []
for doc in range(len(docs)):
    for index in range(len(docs[doc])):
        # context positions within windowSize of the centre word, clipped to the document
        for n in range(max(0, index - windowSize), min(index + windowSize, len(docs[doc]) - 1) + 1):
            if n != index:
                samples.append([wordToInd[docs[doc][index]], wordToInd[docs[doc][n]]])

samples
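As a sanity check on the pair generation, here is the same windowing logic applied to a toy four-token document (the tokens are illustrative, not taken from the 20 Newsgroups data):

# Toy illustration of the skip-gram pairing above with windowSize = 2:
# each word is paired with every neighbour at most two positions away.
toyDoc = ['a', 'b', 'c', 'd']
toyPairs = []
for index in range(len(toyDoc)):
    for n in range(max(0, index - 2), min(index + 2, len(toyDoc) - 1) + 1):
        if n != index:
            toyPairs.append((toyDoc[index], toyDoc[n]))
print(toyPairs)
# [('a','b'), ('a','c'), ('b','a'), ('b','c'), ('b','d'),
#  ('c','a'), ('c','b'), ('c','d'), ('d','b'), ('d','c')]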

samplesSource = np.array([x[0] for x in samples])
samplesTarget = np.array([x[1] for x in samples])
# column vectors of word ids: shape [numSamples, 1]
samplesSource = np.reshape(samplesSource, newshape=[len(samplesSource), 1])
samplesTarget = np.reshape(samplesTarget, newshape=[len(samplesTarget), 1])

samplesSource

print(samplesSource.shape)

tf.__version__

# Switch to the TF1 graph/session API: the placeholders, Session, and
# GradientDescentOptimizer used below are not part of eager TF2.
import tensorflow.compat.v1 as tf
tf.disable_v2_behavior()
tf.__version__

batchSize = 50000
embeddingSize = 50
vocabSize = len(wordToInd)
numNegativeSamples = 10

source = tf.placeholder(tf.int32, shape=[batchSize], name='S')
target = tf.placeholder(tf.int32, shape=[batchSize, 1], name='T')

validationSize = 8
# pick 8 word ids at random from the first 80 ids assigned during vocabulary building
validationX = np.random.choice(validationSize * 10, validationSize, replace=False)
print('valid: ', validationX)

sourceValidation = tf.constant(validationX, dtype=tf.int32)

# one trainable row per vocabulary word, initialised uniformly in [-1, 1]
embeddingMat = tf.Variable(tf.random_uniform(shape=[vocabSize, embeddingSize],
                                             minval=-1, maxval=1),
                           name='embedMatrix')

getWordEmbedding = tf.nn.embedding_lookup(embeddingMat, source)

# output-side weights and biases used by the NCE loss
nceWeights = tf.Variable(tf.random.truncated_normal(shape=[vocabSize, embeddingSize],
                                                    stddev=1.0 / tf.sqrt(embeddingSize * 1.0)))
nceBias = tf.Variable(tf.zeros(shape=[vocabSize]))

# NCE loss: each (source, target) pair is contrasted against numNegativeSamples
# randomly drawn words, so a full softmax over the whole vocabulary is never computed.
loss = tf.reduce_mean(tf.nn.nce_loss(weights=nceWeights,
                                     biases=nceBias,
                                     inputs=getWordEmbedding,
                                     labels=target,
                                     num_sampled=numNegativeSamples,
                                     num_classes=vocabSize))

# L2-normalise every embedding row so that dot products become cosine similarities
norm = tf.sqrt(tf.reduce_sum(tf.square(embeddingMat), 1, keepdims=True))
normalizedEmbeddings = embeddingMat / norm
validationEmbeddings = tf.nn.embedding_lookup(normalizedEmbeddings, sourceValidation)
similarity = tf.matmul(validationEmbeddings, normalizedEmbeddings, transpose_b=True)
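Because every row of normalizedEmbeddings has unit L2 norm, the matrix product above yields cosine similarities directly. A minimal NumPy check of that identity on toy vectors (independent of the trained embeddings):

# dot product of unit-normalised vectors equals cosine similarity
a = np.array([1.0, 2.0, 2.0])
b = np.array([2.0, 0.0, 1.0])
cosine = a.dot(b) / (np.linalg.norm(a) * np.linalg.norm(b))
normalizedDot = (a / np.linalg.norm(a)).dot(b / np.linalg.norm(b))
print(np.isclose(cosine, normalizedDot))   # True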

optOperation = tf.train.GradientDescentOptimizer(learning_rate=0.9).minimize(loss)

numEpochs = 2
learningRate = 0.9   # note: the optimizer above hard-codes its learning rate to 0.9
numBatches = int(len(samples) / batchSize)

with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    for epoch in range(numEpochs):
        epochLoss = 0
        for batch in range(numBatches):
            # draw a random batch of (source, target) index pairs
            indices = np.random.choice(len(samples), batchSize)
            sourceBatch = np.squeeze(samplesSource[indices])   # shape [batchSize]
            targetBatch = samplesTarget[indices]                # shape [batchSize, 1]
            _, batchLoss = sess.run([optOperation, loss],
                                    feed_dict={source: sourceBatch, target: targetBatch})
            epochLoss += batchLoss
            print('batchLoss:\t' + str(batchLoss))
        epochLoss = epochLoss / numBatches
        print(str(epoch) + '\tavgEpochLoss:\t' + str(epochLoss))

    # cosine similarity of the validation words against the whole vocabulary
    similarityScores = sess.run(similarity)
    for i in range(validationSize):
        topK = 5
        # index 0 of the sorted scores is the validation word itself, so skip it
        similarWords = (-similarityScores[i, :]).argsort()[1:topK + 1]
        similarStrings = 'Similar to {0:}: '.format(indToWord[validationX[i]])
        for k in range(topK):
            similarStrings += indToWord[similarWords[k]] + ','
        print(similarStrings)
    finalEmbeddings = sess.run(normalizedEmbeddings)
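Once finalEmbeddings has been pulled out of the session, nearest neighbours for any vocabulary word can also be computed with plain NumPy. A minimal sketch (the query word 'space' is only an illustrative assumption and must exist in wordToInd):

# finalEmbeddings rows are already unit-norm, so a dot product gives cosine similarity
queryWord = 'space'                               # hypothetical query word
queryVec = finalEmbeddings[wordToInd[queryWord]]
scores = finalEmbeddings.dot(queryVec)
nearest = (-scores).argsort()[1:6]                # skip the query word itself
print([indToWord[i] for i in nearest])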
