Word2Vec With TensorFlow ©yousef - DeepDorm

The document provides a detailed implementation of a Word2Vec model using TensorFlow, including data preprocessing, skip-gram generation, and training. It covers the creation of a vocabulary from a text dataset, the generation of training data with positive and negative samples, and the definition of a custom Word2Vec model class. The training process is demonstrated with accuracy and loss metrics displayed over multiple epochs.


import io

import re
import string
import tqdm

import numpy as np

import tensorflow as tf
from tensorflow.keras import layers

# Load the TensorBoard notebook extension
%load_ext tensorboard

SEED = 42
AUTOTUNE = tf.data.AUTOTUNE

sentence = "The wide road shimmered in the hot sun"


tokens = list(sentence.lower().split())
print(len(tokens))

vocab, index = {}, 1  # start indexing from 1
vocab['<pad>'] = 0  # add a padding token
for token in tokens:
  if token not in vocab:
    vocab[token] = index
    index += 1
vocab_size = len(vocab)
print(vocab)

{'<pad>': 0, 'the': 1, 'wide': 2, 'road': 3, 'shimmered': 4, 'in': 5, 'hot': 6, 'sun': 7}

inverse_vocab = {index: token for token, index in vocab.items()}
print(inverse_vocab)

example_sequence = [vocab[word] for word in tokens]
print(example_sequence)

{0: '<pad>', 1: 'the', 2: 'wide', 3: 'road', 4: 'shimmered', 5: 'in', 6: 'hot', 7: 'sun'}
[1, 2, 3, 4, 5, 1, 6, 7]

window_size = 2
positive_skip_grams, _ = tf.keras.preprocessing.sequence.skipgrams(
      example_sequence,
      vocabulary_size=vocab_size,
      window_size=window_size,
      negative_samples=0)
print(len(positive_skip_grams))

26

for target, context in positive_skip_grams[:5]:
  print(f"({target}, {context}): ({inverse_vocab[target]}, {inverse_vocab[context]})")

(3, 1): (road, the)
(5, 1): (in, the)
(6, 1): (hot, the)
(5, 4): (in, shimmered)
(4, 5): (shimmered, in)
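
To make the window mechanics concrete, here is a minimal pure-Python sketch (not part of the original notebook; naive_skipgrams is a hypothetical helper) that enumerates the same (target, context) pairs by hand. For this eight-token sentence it also yields 26 pairs; the Keras utility additionally shuffles the pairs and reserves index 0 as a non-word.

# Hypothetical helper, for illustration only: pair every token with every
# neighbour at most `window_size` positions away.
def naive_skipgrams(sequence, window_size):
  pairs = []
  for i, target in enumerate(sequence):
    for j in range(max(0, i - window_size), min(len(sequence), i + window_size + 1)):
      if j != i:
        pairs.append((target, sequence[j]))
  return pairs

print(len(naive_skipgrams(example_sequence, window_size)))  # 26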

# Get target and context words for one positive skip-gram.
target_word, context_word = positive_skip_grams[0]

# Set the number of negative samples per positive context.
num_ns = 4

context_class = tf.reshape(tf.constant(context_word, dtype="int64"), (1, 1))
negative_sampling_candidates, _, _ = tf.random.log_uniform_candidate_sampler(
    true_classes=context_class,  # class that should be sampled as 'positive'
    num_true=1,  # each positive skip-gram has 1 positive context class
    num_sampled=num_ns,  # number of negative context words to sample
    unique=True,  # all the negative samples should be unique
    range_max=vocab_size,  # pick index of the samples from [0, vocab_size]
    seed=SEED,  # seed for reproducibility
    name="negative_sampling"  # name of this operation
)
print(negative_sampling_candidates)
print([inverse_vocab[index.numpy()] for index in negative_sampling_candidates])

tf.Tensor([2 1 4 3], shape=(4,), dtype=int64)
['wide', 'the', 'shimmered', 'road']
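
As an aside, the candidate sampler draws negatives from a log-uniform (Zipfian) distribution over the id range, so lower ids are sampled more often. A small illustrative sketch, assuming the probability formula given in the tf.random.log_uniform_candidate_sampler documentation:

# Illustration only: log-uniform sampling probability for each id in the toy
# vocabulary (vocab_size is 8 here); the probabilities sum to 1.
import math

def log_uniform_prob(class_id, range_max):
  # P(class) = (log(class + 2) - log(class + 1)) / log(range_max + 1)
  return (math.log(class_id + 2) - math.log(class_id + 1)) / math.log(range_max + 1)

print([round(log_uniform_prob(c, vocab_size), 3) for c in range(vocab_size)])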

# Reduce a dimension so you can use concatenation (in the next step).
squeezed_context_class = tf.squeeze(context_class, 1)

# Concatenate a positive context word with negative sampled words.
context = tf.concat([squeezed_context_class, negative_sampling_candidates], 0)

# Label the first context word as `1` (positive) followed by `num_ns` `0`s (negative).
label = tf.constant([1] + [0]*num_ns, dtype="int64")
target = target_word

print(f"target_index : {target}")
print(f"target_word : {inverse_vocab[target_word]}")
print(f"context_indices : {context}")
print(f"context_words : {[inverse_vocab[c.numpy()] for c in context]}")
print(f"label : {label}")

target_index : 3
target_word : road
context_indices : [1 2 1 4 3]
context_words : ['the', 'wide', 'the', 'shimmered', 'road']
label : [1 0 0 0 0]

print("target :", target)


print("context :", context)
print("label :", label)

target : 3
context : tf.Tensor([1 2 1 4 3], shape=(5,), dtype=int64)
label : tf.Tensor([1 0 0 0 0], shape=(5,), dtype=int64)

sampling_table = tf.keras.preprocessing.sequence.make_sampling_table(size=10)
print(sampling_table)

[0.00315225 0.00315225 0.00547597 0.00741556 0.00912817 0.01068435
 0.01212381 0.01347162 0.01474487 0.0159558 ]
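
make_sampling_table returns, for rank i, the probability of keeping the i-th most common word when generating skip-grams, assuming word frequencies follow a Zipf-like distribution; very frequent words are kept rarely. A rough sketch of the keep-probability formula, taken as an assumption from the Keras docstring (sampling_factor defaults to 1e-5):

# Illustration only: word2vec-style subsampling keep-probability,
# p(word) = min(1, sqrt(f / s) / (f / s)) for word frequency f and sampling factor s.
def keep_probability(frequency, sampling_factor=1e-5):
  ratio = frequency / sampling_factor
  return min(1.0, ratio ** 0.5 / ratio)

print(keep_probability(0.1))   # a very frequent word is kept ~1% of the time
print(keep_probability(1e-5))  # a rare word is always kept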

# Generates skip-gram pairs with negative sampling for a list of sequences
# (int-encoded sentences) based on window size, number of negative samples
# and vocabulary size.
def generate_training_data(sequences, window_size, num_ns, vocab_size, seed):
  # Elements of each training example are appended to these lists.
  targets, contexts, labels = [], [], []

  # Build the sampling table for `vocab_size` tokens.
  sampling_table = tf.keras.preprocessing.sequence.make_sampling_table(vocab_size)

  # Iterate over all sequences (sentences) in the dataset.
  for sequence in tqdm.tqdm(sequences):

    # Generate positive skip-gram pairs for a sequence (sentence).
    positive_skip_grams, _ = tf.keras.preprocessing.sequence.skipgrams(
          sequence,
          vocabulary_size=vocab_size,
          sampling_table=sampling_table,
          window_size=window_size,
          negative_samples=0)

    # Iterate over each positive skip-gram pair to produce training examples
    # with a positive context word and negative samples.
    for target_word, context_word in positive_skip_grams:
      context_class = tf.expand_dims(
          tf.constant([context_word], dtype="int64"), 1)
      negative_sampling_candidates, _, _ = tf.random.log_uniform_candidate_sampler(
          true_classes=context_class,
          num_true=1,
          num_sampled=num_ns,
          unique=True,
          range_max=vocab_size,
          seed=seed,
          name="negative_sampling")

      # Build context and label vectors (for one target word)
      context = tf.concat([tf.squeeze(context_class, 1),
                           negative_sampling_candidates], 0)
      label = tf.constant([1] + [0]*num_ns, dtype="int64")

      # Append each element from the training example to global lists.
      targets.append(target_word)
      contexts.append(context)
      labels.append(label)

  return targets, contexts, labels

path_to_file = tf.keras.utils.get_file('shakespeare.txt',
    'https://storage.googleapis.com/download.tensorflow.org/data/shakespeare.txt')
with open(path_to_file) as f:
  lines = f.read().splitlines()
for line in lines[:20]:
  print(line)

Downloading data from https://storage.googleapis.com/download.tensorflow.org/data/shakespeare.txt
1115394/1115394 ━━━━━━━━━━━━━━━━━━━━ 0s 0us/step
First Citizen:
Before we proceed any further, hear me speak.

All:
Speak, speak.

First Citizen:
You are all resolved rather to die than to famish?

All:
Resolved. resolved.

First Citizen:
First, you know Caius Marcius is chief enemy to the people.

All:
We know't, we know't.

First Citizen:
Let us kill him, and we'll have corn at our own price.

text_ds = tf.data.TextLineDataset(path_to_file).filter(lambda x: tf.cast(tf.strings.length(x), bool))

# Now, create a custom standardization function to lowercase the text and
# remove punctuation.
def custom_standardization(input_data):
  lowercase = tf.strings.lower(input_data)
  return tf.strings.regex_replace(lowercase,
                                  '[%s]' % re.escape(string.punctuation), '')
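
A quick sanity check, not in the original notebook, of what the standardizer does to one raw line (the expected value follows directly from lowercasing and stripping punctuation):

# Illustration only: lowercases the text and removes punctuation.
print(custom_standardization(tf.constant("Before we proceed any further, hear me speak.")))
# expected: tf.Tensor(b'before we proceed any further hear me speak', shape=(), dtype=string)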

# Define the vocabulary size and the number of words in a sequence.
vocab_size = 4096
sequence_length = 10

# Use the `TextVectorization` layer to normalize, split, and map strings to
# integers. Set the `output_sequence_length` length to pad all samples to the
# same length.
vectorize_layer = layers.TextVectorization(
    standardize=custom_standardization,
    max_tokens=vocab_size,
    output_mode='int',
    output_sequence_length=sequence_length)

vectorize_layer.adapt(text_ds.batch(1024))

# Save the created vocabulary for reference.
inverse_vocab = vectorize_layer.get_vocabulary()
print(inverse_vocab[:20])

# Vectorize the data in text_ds.
text_vector_ds = text_ds.batch(1024).prefetch(AUTOTUNE).map(vectorize_layer).unbatch()

['', '[UNK]', 'the', 'and', 'to', 'i', 'of', 'you', 'my', 'a', 'that',
'in', 'is', 'not', 'for', 'with', 'me', 'it', 'be', 'your']
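
For illustration (not in the original notebook), the adapted layer can also be applied directly to a raw sentence to see the standardized tokens mapped to padded integer ids:

# Illustration only: vectorize one raw line; the result is a (1, sequence_length) int tensor.
print(vectorize_layer(tf.constant(["Before we proceed any further, hear me speak."])))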
sequences = list(text_vector_ds.as_numpy_iterator())
print(len(sequences))
for seq in sequences[:5]:
  print(f"{seq} => {[inverse_vocab[i] for i in seq]}")

32777
[ 89 270 0 0 0 0 0 0 0 0] => ['first', 'citizen', '', '', '', '', '', '', '', '']
[138 36 982 144 673 125 16 106 0 0] => ['before', 'we', 'proceed', 'any', 'further', 'hear', 'me', 'speak', '', '']
[34 0 0 0 0 0 0 0 0 0] => ['all', '', '', '', '', '', '', '', '', '']
[106 106 0 0 0 0 0 0 0 0] => ['speak', 'speak', '', '', '', '', '', '', '', '']
[ 89 270 0 0 0 0 0 0 0 0] => ['first', 'citizen', '', '', '', '', '', '', '', '']

targets, contexts, labels = generate_training_data(
    sequences=sequences,
    window_size=2,
    num_ns=4,
    vocab_size=vocab_size,
    seed=SEED)

targets = np.array(targets)
contexts = np.array(contexts)
labels = np.array(labels)

print('\n')
print(f"targets.shape: {targets.shape}")
print(f"contexts.shape: {contexts.shape}")
print(f"labels.shape: {labels.shape}")

100%|██████████| 32777/32777 [00:30<00:00, 1079.30it/s]

targets.shape: (65071,)
contexts.shape: (65071, 5)
labels.shape: (65071, 5)
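
Each row i across these three arrays is one training example: a scalar target word id, its num_ns + 1 = 5 candidate context ids (the positive context first, then the sampled negatives), and the matching 1/0 labels. An illustrative peek at the first example:

# Illustration only: one (target, contexts, labels) training example.
print(targets[0], contexts[0], labels[0])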

BATCH_SIZE = 1024
BUFFER_SIZE = 10000
dataset = tf.data.Dataset.from_tensor_slices(((targets, contexts), labels))
dataset = dataset.shuffle(BUFFER_SIZE).batch(BATCH_SIZE, drop_remainder=True)
print(dataset)

dataset = dataset.cache().prefetch(buffer_size=AUTOTUNE)
print(dataset)
<_BatchDataset element_spec=((TensorSpec(shape=(1024,),
dtype=tf.int64, name=None), TensorSpec(shape=(1024, 5),
dtype=tf.int64, name=None)), TensorSpec(shape=(1024, 5),
dtype=tf.int64, name=None))>
<_PrefetchDataset element_spec=((TensorSpec(shape=(1024,),
dtype=tf.int64, name=None), TensorSpec(shape=(1024, 5),
dtype=tf.int64, name=None)), TensorSpec(shape=(1024, 5),
dtype=tf.int64, name=None))>

class Word2Vec(tf.keras.Model):
  def __init__(self, vocab_size, embedding_dim):
    super(Word2Vec, self).__init__()
    self.target_embedding = layers.Embedding(vocab_size,
                                              embedding_dim,
                                              name="w2v_embedding")
    self.context_embedding = layers.Embedding(vocab_size,
                                               embedding_dim)

  def call(self, pair):
    target, context = pair
    # target: (batch, dummy?)  # The dummy axis doesn't exist in TF2.7+
    # context: (batch, context)
    if len(target.shape) == 2:
      target = tf.squeeze(target, axis=1)
    # target: (batch,)
    word_emb = self.target_embedding(target)
    # word_emb: (batch, embed)
    context_emb = self.context_embedding(context)
    # context_emb: (batch, context, embed)
    dots = tf.einsum('be,bce->bc', word_emb, context_emb)
    # dots: (batch, context)
    return dots
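
A quick shape check, for illustration only (demo_model, demo_targets and demo_contexts are hypothetical names, not part of the original notebook): a batch of 3 targets with num_ns + 1 = 5 context candidates each yields one logit per candidate.

# Illustration only: the model maps (targets, contexts) to logits of shape (batch, num_ns + 1).
demo_model = Word2Vec(vocab_size=4096, embedding_dim=16)
demo_targets = tf.constant([1, 2, 3], dtype=tf.int64)
demo_contexts = tf.constant([[1, 5, 9, 13, 17]] * 3, dtype=tf.int64)
print(demo_model((demo_targets, demo_contexts)).shape)  # (3, 5)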

def custom_loss(x_logit, y_true):
  return tf.nn.sigmoid_cross_entropy_with_logits(logits=x_logit, labels=y_true)
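
Note that custom_loss, the element-wise sigmoid cross-entropy of the negative-sampling objective, is defined here, while the compile call below uses CategoricalCrossentropy(from_logits=True) instead. For illustration only, here is what the sigmoid version returns on a made-up batch of logits shaped like the training examples above:

# Illustration only: per-candidate sigmoid cross-entropy for one made-up example.
toy_logits = tf.constant([[2.0, -1.0, 0.5, -0.5, -2.0]])
toy_labels = tf.constant([[1.0, 0.0, 0.0, 0.0, 0.0]])
print(custom_loss(toy_logits, toy_labels))  # shape (1, 5), one loss term per context candidate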

embedding_dim = 128
word2vec = Word2Vec(vocab_size, embedding_dim)
word2vec.compile(optimizer='adam',
                 loss=tf.keras.losses.CategoricalCrossentropy(from_logits=True),
                 metrics=['accuracy'])

tensorboard_callback = tf.keras.callbacks.TensorBoard(log_dir="logs")
word2vec.fit(dataset, epochs=20, callbacks=[tensorboard_callback])

#docs_infra: no_execute
%tensorboard --logdir logs
Epoch 1/20
63/63 ━━━━━━━━━━━━━━━━━━━━ 3s 21ms/step - accuracy: 0.2169 - loss: 1.6089
Epoch 2/20
63/63 ━━━━━━━━━━━━━━━━━━━━ 1s 14ms/step - accuracy: 0.5987 - loss: 1.5893
Epoch 3/20
63/63 ━━━━━━━━━━━━━━━━━━━━ 1s 14ms/step - accuracy: 0.6072 - loss: 1.5316
Epoch 4/20
63/63 ━━━━━━━━━━━━━━━━━━━━ 1s 18ms/step - accuracy: 0.5546 - loss: 1.4435
Epoch 5/20
63/63 ━━━━━━━━━━━━━━━━━━━━ 2s 24ms/step - accuracy: 0.5701 - loss: 1.3471
Epoch 6/20
63/63 ━━━━━━━━━━━━━━━━━━━━ 2s 14ms/step - accuracy: 0.6050 - loss: 1.2497
Epoch 7/20
63/63 ━━━━━━━━━━━━━━━━━━━━ 1s 15ms/step - accuracy: 0.6437 - loss: 1.1578
Epoch 8/20
63/63 ━━━━━━━━━━━━━━━━━━━━ 1s 15ms/step - accuracy: 0.6818 - loss: 1.0728
Epoch 9/20
63/63 ━━━━━━━━━━━━━━━━━━━━ 1s 15ms/step - accuracy: 0.7160 - loss: 0.9944
Epoch 10/20
63/63 ━━━━━━━━━━━━━━━━━━━━ 1s 15ms/step - accuracy: 0.7428 - loss: 0.9221
Epoch 11/20
63/63 ━━━━━━━━━━━━━━━━━━━━ 1s 15ms/step - accuracy: 0.7689 - loss: 0.8556
Epoch 12/20
63/63 ━━━━━━━━━━━━━━━━━━━━ 1s 16ms/step - accuracy: 0.7898 - loss: 0.7944
Epoch 13/20
63/63 ━━━━━━━━━━━━━━━━━━━━ 1s 15ms/step - accuracy: 0.8084 - loss: 0.7382
Epoch 14/20
63/63 ━━━━━━━━━━━━━━━━━━━━ 1s 15ms/step - accuracy: 0.8264 - loss: 0.6867
Epoch 15/20
63/63 ━━━━━━━━━━━━━━━━━━━━ 2s 22ms/step - accuracy: 0.8410 - loss: 0.6397
Epoch 16/20
63/63 ━━━━━━━━━━━━━━━━━━━━ 2s 14ms/step - accuracy: 0.8546 - loss: 0.5967
Epoch 17/20
63/63 ━━━━━━━━━━━━━━━━━━━━ 1s 14ms/step - accuracy: 0.8672 - loss: 0.5575
Epoch 18/20
63/63 ━━━━━━━━━━━━━━━━━━━━ 1s 14ms/step - accuracy: 0.8782 - loss: 0.5219
Epoch 19/20
63/63 ━━━━━━━━━━━━━━━━━━━━ 1s 15ms/step - accuracy: 0.8881 - loss: 0.4894
Epoch 20/20
63/63 ━━━━━━━━━━━━━━━━━━━━ 1s 16ms/step - accuracy: 0.8968 - loss: 0.4598
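
After training, the learned target embeddings can be read back out of the layer named "w2v_embedding"; a minimal sketch, not part of the original notebook:

# Illustration only: the trained embedding matrix, one 128-dimensional vector per vocabulary entry.
weights = word2vec.get_layer('w2v_embedding').get_weights()[0]
print(weights.shape)  # (4096, 128)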

