Word2Vec With TensorFlow ©yousef - DeepDorm
import re
import string
import tqdm
import numpy as np
import tensorflow as tf
from tensorflow.keras import layers
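The skip-gram example that follows operates on a small example sentence whose tokenization is not shown in this excerpt. A minimal setup sketch, assuming the example sentence from the standard TensorFlow word2vec tutorial (the names example_sequence, vocab_size, and inverse_vocab are taken from the code below; the resulting indices are consistent with the printed skip-grams):

sentence = "The wide road shimmered in the hot sun"
tokens = list(sentence.lower().split())

# Build a vocabulary that maps tokens to integer indices; index 0 is reserved for padding.
vocab, index = {}, 1
vocab['<pad>'] = 0
for token in tokens:
    if token not in vocab:
        vocab[token] = index
        index += 1
vocab_size = len(vocab)

# Inverse mapping from indices back to tokens, used for printing below.
inverse_vocab = {index: token for token, index in vocab.items()}

# Encode the sentence as a sequence of token indices.
example_sequence = [vocab[word] for word in tokens]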
window_size = 2
positive_skip_grams, _ = tf.keras.preprocessing.sequence.skipgrams(
    example_sequence,
    vocabulary_size=vocab_size,
    window_size=window_size,
    negative_samples=0)
print(len(positive_skip_grams))
26
for target, context in positive_skip_grams[:5]:
    print(f"({target}, {context}): ({inverse_vocab[target]}, {inverse_vocab[context]})")
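The context_class tensor squeezed in the next step comes from a negative sampling step that is not shown in this excerpt. A minimal sketch, assuming num_ns = 4 negative samples per positive context word and an arbitrary SEED (both values are assumptions):

# Number of negative samples per positive context word (assumed).
num_ns = 4
SEED = 42  # arbitrary seed, assumed for reproducibility

# Take one positive (target, context) pair from the skip-grams above.
target_word, context_word = positive_skip_grams[0]

# Reshape the context word into the (1, num_true) shape the candidate sampler expects.
context_class = tf.reshape(tf.constant(context_word, dtype="int64"), (1, 1))

# Draw num_ns candidate negative samples from a log-uniform (Zipfian) distribution.
negative_sampling_candidates, _, _ = tf.random.log_uniform_candidate_sampler(
    true_classes=context_class,  # the positive context word
    num_true=1,
    num_sampled=num_ns,
    unique=True,
    range_max=vocab_size,
    seed=SEED,
    name="negative_sampling")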
# Reduce a dimension so you can use concatenation (in the next step).
squeezed_context_class = tf.squeeze(context_class, 1)
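The printed example below pairs the positive context word with the negative samples and labels them; a sketch of that step (names follow the code above):

# Concatenate the positive context word with the negative-sample candidates.
context = tf.concat([squeezed_context_class, negative_sampling_candidates], 0)

# Label the first (positive) context word 1 and the num_ns negatives 0.
label = tf.constant([1] + [0] * num_ns, dtype="int64")
target = target_word

print(f"target_index : {target}")
print(f"target_word : {inverse_vocab[target]}")
print(f"context_indices : {context}")
print(f"context_words : {[inverse_vocab[c.numpy()] for c in context]}")
print(f"label : {label}")

print("target :", target)
print("context :", context)
print("label :", label)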
target_index : 3
target_word : road
context_indices : [1 2 1 4 3]
context_words : ['the', 'wide', 'the', 'shimmered', 'road']
label : [1 0 0 0 0]
target : 3
context : tf.Tensor([1 2 1 4 3], shape=(5,), dtype=int64)
label : tf.Tensor([1 0 0 0 0], shape=(5,), dtype=int64)
sampling_table = tf.keras.preprocessing.sequence.make_sampling_table(size=10)
print(sampling_table)
path_to_file = tf.keras.utils.get_file(
    'shakespeare.txt',
    'https://storage.googleapis.com/download.tensorflow.org/data/shakespeare.txt')
with open(path_to_file) as f:
    lines = f.read().splitlines()

for line in lines[:20]:
    print(line)
All:
Speak, speak.
First Citizen:
You are all resolved rather to die than to famish?
All:
Resolved. resolved.
First Citizen:
First, you know Caius Marcius is chief enemy to the people.
All:
We know't, we know't.
First Citizen:
Let us kill him, and we'll have corn at our own price.
text_ds = tf.data.TextLineDataset(path_to_file).filter(
    lambda x: tf.cast(tf.strings.length(x), bool))
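Before adapt is called below, a TextVectorization layer has to be defined; that definition is not included in this excerpt. A minimal sketch, assuming a vocabulary size of 4096 and sequences padded to 10 tokens (the sequence length matches the vectorized output shown further down; the exact vocabulary size is an assumption):

# Lowercase the text and strip punctuation before tokenization.
def custom_standardization(input_data):
    lowercase = tf.strings.lower(input_data)
    return tf.strings.regex_replace(lowercase,
                                    '[%s]' % re.escape(string.punctuation), '')

vocab_size = 4096      # assumed vocabulary size
sequence_length = 10   # pad or truncate every line to 10 tokens

# Normalize, split, and map strings to integer indices, padding all samples to the same length.
vectorize_layer = layers.TextVectorization(
    standardize=custom_standardization,
    max_tokens=vocab_size,
    output_mode='int',
    output_sequence_length=sequence_length)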
vectorize_layer.adapt(text_ds.batch(1024))
# Save the created vocabulary for reference.
inverse_vocab = vectorize_layer.get_vocabulary()
print(inverse_vocab[:20])
['', '[UNK]', 'the', 'and', 'to', 'i', 'of', 'you', 'my', 'a', 'that', 'in', 'is', 'not', 'for', 'with', 'me', 'it', 'be', 'your']
# Vectorize the data in text_ds.
AUTOTUNE = tf.data.AUTOTUNE
text_vector_ds = text_ds.batch(1024).prefetch(AUTOTUNE).map(vectorize_layer).unbatch()
sequences = list(text_vector_ds.as_numpy_iterator())
print(len(sequences))

for seq in sequences[:5]:
    print(f"{seq} => {[inverse_vocab[i] for i in seq]}")
32777
[ 89 270   0   0   0   0   0   0   0   0] => ['first', 'citizen', '', '', '', '', '', '', '', '']
[138  36 982 144 673 125  16 106   0   0] => ['before', 'we', 'proceed', 'any', 'further', 'hear', 'me', 'speak', '', '']
[34  0  0  0  0  0  0  0  0  0] => ['all', '', '', '', '', '', '', '', '', '']
[106 106   0   0   0   0   0   0   0   0] => ['speak', 'speak', '', '', '', '', '', '', '', '']
[ 89 270   0   0   0   0   0   0   0   0] => ['first', 'citizen', '', '', '', '', '', '', '', '']
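The targets, contexts, and labels arrays below are produced by a training-data generation step that this excerpt does not show. A condensed sketch of that step, assuming num_ns = 4 negative samples, window_size = 2, and a fixed seed (the helper name generate_training_data is an assumption); it combines the skip-gram sampling, subsampling table, and negative sampling shown earlier:

def generate_training_data(sequences, window_size, num_ns, vocab_size, seed):
    targets, contexts, labels = [], [], []

    # Subsampling table that down-weights very frequent tokens.
    sampling_table = tf.keras.preprocessing.sequence.make_sampling_table(vocab_size)

    for sequence in tqdm.tqdm(sequences):
        # Positive skip-gram pairs for this sentence.
        positive_skip_grams, _ = tf.keras.preprocessing.sequence.skipgrams(
            sequence,
            vocabulary_size=vocab_size,
            sampling_table=sampling_table,
            window_size=window_size,
            negative_samples=0)

        # For each positive pair, draw num_ns negative-sample context words.
        for target_word, context_word in positive_skip_grams:
            context_class = tf.expand_dims(
                tf.constant([context_word], dtype="int64"), 1)
            negative_sampling_candidates, _, _ = tf.random.log_uniform_candidate_sampler(
                true_classes=context_class,
                num_true=1,
                num_sampled=num_ns,
                unique=True,
                range_max=vocab_size,
                seed=seed,
                name="negative_sampling")

            # One positive context word followed by num_ns negatives.
            context = tf.concat(
                [tf.squeeze(context_class, 1), negative_sampling_candidates], 0)
            label = tf.constant([1] + [0] * num_ns, dtype="int64")

            targets.append(target_word)
            contexts.append(context)
            labels.append(label)

    return targets, contexts, labels

targets, contexts, labels = generate_training_data(
    sequences=sequences, window_size=2, num_ns=4,
    vocab_size=vocab_size, seed=42)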
targets = np.array(targets)
contexts = np.array(contexts)
labels = np.array(labels)
print('\n')
print(f"targets.shape: {targets.shape}")
print(f"contexts.shape: {contexts.shape}")
print(f"labels.shape: {labels.shape}")
targets.shape: (65071,)
contexts.shape: (65071, 5)
labels.shape: (65071, 5)
BATCH_SIZE = 1024
BUFFER_SIZE = 10000
dataset = tf.data.Dataset.from_tensor_slices(((targets, contexts), labels))
dataset = dataset.shuffle(BUFFER_SIZE).batch(BATCH_SIZE, drop_remainder=True)
print(dataset)
dataset = dataset.cache().prefetch(buffer_size=AUTOTUNE)
print(dataset)
<_BatchDataset element_spec=((TensorSpec(shape=(1024,),
dtype=tf.int64, name=None), TensorSpec(shape=(1024, 5),
dtype=tf.int64, name=None)), TensorSpec(shape=(1024, 5),
dtype=tf.int64, name=None))>
<_PrefetchDataset element_spec=((TensorSpec(shape=(1024,),
dtype=tf.int64, name=None), TensorSpec(shape=(1024, 5),
dtype=tf.int64, name=None)), TensorSpec(shape=(1024, 5),
dtype=tf.int64, name=None))>
class Word2Vec(tf.keras.Model):
    def __init__(self, vocab_size, embedding_dim):
        super(Word2Vec, self).__init__()
        self.target_embedding = layers.Embedding(vocab_size, embedding_dim,
                                                 name="w2v_embedding")
        self.context_embedding = layers.Embedding(vocab_size, embedding_dim)

    def call(self, pair):
        target, context = pair
        word_emb = self.target_embedding(target)        # (batch, embed)
        context_emb = self.context_embedding(context)   # (batch, num_ns+1, embed)
        # Dot product between the target embedding and each context embedding.
        return tf.einsum('be,bce->bc', word_emb, context_emb)  # (batch, num_ns+1)
embedding_dim = 128
word2vec = Word2Vec(vocab_size, embedding_dim)
word2vec.compile(optimizer='adam',
                 loss=tf.keras.losses.CategoricalCrossentropy(from_logits=True),
                 metrics=['accuracy'])
tensorboard_callback = tf.keras.callbacks.TensorBoard(log_dir="logs")
word2vec.fit(dataset, epochs=20, callbacks=[tensorboard_callback])
#docs_infra: no_execute
%tensorboard --logdir logs
Epoch 1/20
63/63 ━━━━━━━━━━━━━━━━━━━━ 3s 21ms/step - accuracy: 0.2169 - loss: 1.6089
Epoch 2/20
63/63 ━━━━━━━━━━━━━━━━━━━━ 1s 14ms/step - accuracy: 0.5987 - loss: 1.5893
Epoch 3/20
63/63 ━━━━━━━━━━━━━━━━━━━━ 1s 14ms/step - accuracy: 0.6072 - loss: 1.5316
Epoch 4/20
63/63 ━━━━━━━━━━━━━━━━━━━━ 1s 18ms/step - accuracy: 0.5546 - loss: 1.4435
Epoch 5/20
63/63 ━━━━━━━━━━━━━━━━━━━━ 2s 24ms/step - accuracy: 0.5701 - loss: 1.3471
Epoch 6/20
63/63 ━━━━━━━━━━━━━━━━━━━━ 2s 14ms/step - accuracy: 0.6050 - loss: 1.2497
Epoch 7/20
63/63 ━━━━━━━━━━━━━━━━━━━━ 1s 15ms/step - accuracy: 0.6437 - loss: 1.1578
Epoch 8/20
63/63 ━━━━━━━━━━━━━━━━━━━━ 1s 15ms/step - accuracy: 0.6818 - loss: 1.0728
Epoch 9/20
63/63 ━━━━━━━━━━━━━━━━━━━━ 1s 15ms/step - accuracy: 0.7160 - loss: 0.9944
Epoch 10/20
63/63 ━━━━━━━━━━━━━━━━━━━━ 1s 15ms/step - accuracy: 0.7428 - loss: 0.9221
Epoch 11/20
63/63 ━━━━━━━━━━━━━━━━━━━━ 1s 15ms/step - accuracy: 0.7689 - loss: 0.8556
Epoch 12/20
63/63 ━━━━━━━━━━━━━━━━━━━━ 1s 16ms/step - accuracy: 0.7898 - loss: 0.7944
Epoch 13/20
63/63 ━━━━━━━━━━━━━━━━━━━━ 1s 15ms/step - accuracy: 0.8084 - loss: 0.7382
Epoch 14/20
63/63 ━━━━━━━━━━━━━━━━━━━━ 1s 15ms/step - accuracy: 0.8264 - loss: 0.6867
Epoch 15/20
63/63 ━━━━━━━━━━━━━━━━━━━━ 2s 22ms/step - accuracy: 0.8410 - loss: 0.6397
Epoch 16/20
63/63 ━━━━━━━━━━━━━━━━━━━━ 2s 14ms/step - accuracy: 0.8546 - loss: 0.5967
Epoch 17/20
63/63 ━━━━━━━━━━━━━━━━━━━━ 1s 14ms/step - accuracy: 0.8672 - loss: 0.5575
Epoch 18/20
63/63 ━━━━━━━━━━━━━━━━━━━━ 1s 14ms/step - accuracy: 0.8782 - loss: 0.5219
Epoch 19/20
63/63 ━━━━━━━━━━━━━━━━━━━━ 1s 15ms/step - accuracy: 0.8881 - loss: 0.4894
Epoch 20/20
63/63 ━━━━━━━━━━━━━━━━━━━━ 1s 16ms/step - accuracy: 0.8968 - loss: 0.4598