Course 3 - Week 2 - Exercise - Answer - Ipynb - Colaboratory

Copyright 2019 The TensorFlow Authors.

#@title Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import csv
import tensorflow as tf
import numpy as np
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

!wget --no-check-certificate \
    https://storage.googleapis.com/laurencemoroney-blog.appspot.com/bbc-text.csv \
    -O /tmp/bbc-text.csv

--2020-07-12 13:59:26-- https://storage.googleapis.com/laurencemoroney-blog.appspot.


Resolving storage.googleapis.com (storage.googleapis.com)... 173.194.76.128, 64.233.1
Connecting to storage.googleapis.com (storage.googleapis.com)|173.194.76.128|:443...
HTTP request sent, awaiting response... 200 OK
Length: 5057493 (4.8M) [application/octet-stream]
Saving to: ‘/tmp/bbc-text.csv’

/tmp/bbc-text.csv 100%[===================>] 4.82M --.-KB/s in 0.03s

2020-07-12 13:59:27 (179 MB/s) - ‘/tmp/bbc-text.csv’ saved [5057493/5057493]

vocab_size = 1000
embedding_dim = 16
max_length = 120
trunc_type='post'
padding_type='post'
oov_tok = "<OOV>"
training_portion = .8

sentences = []
labels = []
stopwords = [ "a", "about", "above", "after", "again", "against", "all", "am", "an", "and",
print(len(stopwords))
# Expected Output
# 153

153

with open("/tmp/bbc-text.csv", 'r') as csvfile:


reader = csv.reader(csvfile, delimiter=',')
next(reader)
for row in reader:
labels.append(row[0])
sentence = row[1]
for word in stopwords:
token = " " + word + " "
sentence = sentence.replace(token, " ")
sentences.append(sentence)

print(len(labels))
print(len(sentences))
print(sentences[0])
# Expected Output
# 2225
# 2225
# tv future hands viewers home theatre systems plasma high-definition tvs digital video r

2225
2225
tv future hands viewers home theatre systems plasma high-definition tvs digital vid
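Side note, not part of the graded exercise: the replace-based loop above only removes stopwords that have a space on both sides, so a stopword at the very start or end of an article survives. A split-based sketch that avoids that edge case:

# Illustrative alternative: tokenize on whitespace and filter against a set.
stopword_set = set(stopwords)

def remove_stopwords(sentence):
    return " ".join(w for w in sentence.split() if w not in stopword_set)

# Example: remove_stopwords("the cat sat on the mat") -> "cat sat mat"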

train_size = int(len(sentences) * training_portion)

train_sentences = sentences[:train_size]
train_labels = labels[:train_size]

validation_sentences = sentences[train_size:]
validation_labels = labels[train_size:]

print(train_size)
print(len(train_sentences))
print(len(train_labels))
print(len(validation_sentences))
print(len(validation_labels))

# Expected output (if training_portion=.8)


# 1780
# 1780
# 1780
# 445
# 445
1780
1780
1780
445
445

tokenizer = Tokenizer(num_words = vocab_size, oov_token=oov_tok)
tokenizer.fit_on_texts(train_sentences)
word_index = tokenizer.word_index

train_sequences = tokenizer.texts_to_sequences(train_sentences)
train_padded = pad_sequences(train_sequences, padding=padding_type, maxlen=max_length)

print(len(train_sequences[0]))
print(len(train_padded[0]))

print(len(train_sequences[1]))
print(len(train_padded[1]))

print(len(train_sequences[10]))
print(len(train_padded[10]))

# Expected Output
# 449
# 120
# 200
# 120
# 192
# 120

449
120
200
120
192
120
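A quick illustration (not part of the exercise) of why every padded row comes out at length 120: pad_sequences pads short sequences and truncates long ones. Note that the call above does not pass truncating, so it falls back to the default 'pre' truncation even though trunc_type is defined.

# Toy example: one short and one long sequence, both forced to length 5.
demo = [[1, 2, 3], [1, 2, 3, 4, 5, 6, 7]]
print(pad_sequences(demo, padding='post', truncating='post', maxlen=5))
# [[1 2 3 0 0]
#  [1 2 3 4 5]]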

validation_sequences = tokenizer.texts_to_sequences(validation_sentences)
validation_padded = pad_sequences(validation_sequences, padding=padding_type, maxlen=max_length)

print(len(validation_sequences))
print(validation_padded.shape)

# Expected output
# 445
# (445, 120)

445
(445, 120)
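Because the tokenizer was fitted only on the training sentences, words that appear only in the validation set fall back to the OOV token. A quick illustrative check (the OOV token normally gets index 1):

# The OOV token sits at the front of the word index.
print(word_index[oov_tok])
# A made-up word maps to the OOV index rather than being dropped.
print(tokenizer.texts_to_sequences(["qwertyuiop"]))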

label_tokenizer = Tokenizer()
label_tokenizer.fit_on_texts(labels)

training_label_seq = np.array(label_tokenizer.texts_to_sequences(train_labels))
validation_label_seq = np.array(label_tokenizer.texts_to_sequences(validation_labels))

print(training_label_seq[0])
print(training_label_seq[1])
print(training_label_seq[2])
print(training_label_seq.shape)

print(validation_label_seq[0])
print(validation_label_seq[1])
print(validation_label_seq[2])
print(validation_label_seq.shape)

# Expected output
# [4]
# [2]
# [1]
# (1780, 1)
# [5]
# [4]
# [3]
# (445, 1)

[4]
[2]
[1]
(1780, 1)
[5]
[4]
[3]
(445, 1)
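Tokenizer indices start at 1, so the five BBC categories map to 1–5 and the final Dense layer in the model below needs 6 units (index 0 is simply never used). To inspect the mapping (illustrative; the exact order depends on label frequency):

# Which integer did each category receive?
print(label_tokenizer.word_index)
# e.g. {'sport': 1, 'business': 2, 'politics': 3, 'tech': 4, 'entertainment': 5}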

model = tf.keras.Sequential([
    tf.keras.layers.Embedding(vocab_size, embedding_dim, input_length=max_length),
    tf.keras.layers.GlobalAveragePooling1D(),
    tf.keras.layers.Dense(24, activation='relu'),
    tf.keras.layers.Dense(6, activation='softmax')
])
model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
model.summary()

# Expected Output
# Layer (type) Output Shape Param #
# =================================================================
# embedding (Embedding) (None, 120, 16) 16000
# _________________________________________________________________
# global_average_pooling1d (Gl (None, 16) 0
# _________________________________________________________________
# dense (Dense) (None, 24) 408
# _________________________________________________________________
# dense_1 (Dense) (None, 6) 150
# =================================================================
# Total params: 16,558
# Trainable params: 16,558
# Non-trainable params: 0
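For reference, the parameter counts in the summary follow directly from the layer sizes; a quick sanity check (illustrative arithmetic only):

print(vocab_size * embedding_dim)   # embedding: 1000 words x 16 dims = 16000
print(16 * 24 + 24)                 # dense: 16 inputs x 24 units + 24 biases = 408
print(24 * 6 + 6)                   # dense_1: 24 inputs x 6 units + 6 biases = 150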
Model: "sequential"
_________________________________________________________________
Layer (type) Output Shape Param #
=================================================================
embedding (Embedding) (None, 120, 16) 16000
_________________________________________________________________
global_average_pooling1d (Gl (None, 16) 0
_________________________________________________________________
dense (Dense) (None, 24) 408
_________________________________________________________________
dense_1 (Dense) (None, 6) 150
=================================================================
Total params: 16,558
Trainable params: 16,558
Non-trainable params: 0
_________________________________________________________________

num_epochs = 30

history = model.fit(train_padded, training_label_seq, epochs=num_epochs, validation_data=(validation_padded, validation_label_seq), verbose=2)
Epoch 1/30
56/56 - 0s - loss: 1.7632 - accuracy: 0.2315 - val_loss: 1.7279 - val_accuracy: 0.229
Epoch 2/30
56/56 - 0s - loss: 1.6797 - accuracy: 0.2326 - val_loss: 1.6301 - val_accuracy: 0.269
Epoch 3/30
56/56 - 0s - loss: 1.5684 - accuracy: 0.4124 - val_loss: 1.5137 - val_accuracy: 0.429
Epoch 4/30
56/56 - 0s - loss: 1.4253 - accuracy: 0.4809 - val_loss: 1.3512 - val_accuracy: 0.525
Epoch 5/30
56/56 - 0s - loss: 1.2274 - accuracy: 0.5961 - val_loss: 1.1528 - val_accuracy: 0.671
Epoch 6/30
56/56 - 0s - loss: 1.0137 - accuracy: 0.7511 - val_loss: 0.9600 - val_accuracy: 0.813
Epoch 7/30
56/56 - 0s - loss: 0.8240 - accuracy: 0.8579 - val_loss: 0.8014 - val_accuracy: 0.860
Epoch 8/30
56/56 - 0s - loss: 0.6696 - accuracy: 0.9107 - val_loss: 0.6733 - val_accuracy: 0.885
Epoch 9/30
56/56 - 0s - loss: 0.5459 - accuracy: 0.9281 - val_loss: 0.5711 - val_accuracy: 0.901
Epoch 10/30
56/56 - 0s - loss: 0.4440 - accuracy: 0.9438 - val_loss: 0.4865 - val_accuracy: 0.921
Epoch 11/30
56/56 - 0s - loss: 0.3640 - accuracy: 0.9567 - val_loss: 0.4199 - val_accuracy: 0.921
Epoch 12/30
56/56 - 0s - loss: 0.3000 - accuracy: 0.9596 - val_loss: 0.3700 - val_accuracy: 0.921
Epoch 13/30
56/56 - 0s - loss: 0.2512 - accuracy: 0.9691 - val_loss: 0.3320 - val_accuracy: 0.923
Epoch 14/30
56/56 - 0s - loss: 0.2149 - accuracy: 0.9725 - val_loss: 0.3016 - val_accuracy: 0.928
Epoch 15/30
56/56 - 0s - loss: 0.1848 - accuracy: 0.9747 - val_loss: 0.2825 - val_accuracy: 0.928
Epoch 16/30
56/56 - 0s - loss: 0.1620 - accuracy: 0.9781 - val_loss: 0.2639 - val_accuracy: 0.932
Epoch 17/30
56/56 - 0s - loss: 0.1425 - accuracy: 0.9815 - val_loss: 0.2504 - val_accuracy: 0.934
Epoch 18/30
56/56 - 0s - loss: 0.1274 - accuracy: 0.9815 - val_loss: 0.2400 - val_accuracy: 0.932
Epoch 19/30
56/56 - 0s - loss: 0.1130 - accuracy: 0.9848 - val_loss: 0.2305 - val_accuracy: 0.932
Epoch 20/30
56/56 - 0s - loss: 0.1015 - accuracy: 0.9871 - val_loss: 0.2225 - val_accuracy: 0.932
Epoch 21/30
56/56 - 0s - loss: 0.0916 - accuracy: 0.9865 - val_loss: 0.2196 - val_accuracy: 0.934
Epoch 22/30
56/56 - 0s - loss: 0.0827 - accuracy: 0.9899 - val_loss: 0.2122 - val_accuracy: 0.934
Epoch 23/30
56/56 - 0s - loss: 0.0741 - accuracy: 0.9916 - val_loss: 0.2092 - val_accuracy: 0.932
Epoch 24/30
56/56 - 0s - loss: 0.0678 - accuracy: 0.9921 - val_loss: 0.2040 - val_accuracy: 0.934
Epoch 25/30
56/56 - 0s - loss: 0.0610 - accuracy: 0.9955 - val_loss: 0.2015 - val_accuracy: 0.937
Epoch 26/30
56/56 - 0s - loss: 0.0555 - accuracy: 0.9961 - val_loss: 0.1992 - val_accuracy: 0.939
Epoch 27/30
56/56 - 0s - loss: 0.0504 - accuracy: 0.9966 - val_loss: 0.1952 - val_accuracy: 0.941
Epoch 28/30
56/56 - 0s - loss: 0.0460 - accuracy: 0.9983 - val_loss: 0.1953 - val_accuracy: 0.939
Epoch 29/30
56/56 - 0s - loss: 0.0419 - accuracy: 0.9994 - val_loss: 0.1911 - val_accuracy: 0.948
Epoch 30/30
56/56 - 0s - loss: 0.0385 - accuracy: 0.9994 - val_loss: 0.1905 - val_accuracy: 0.939

import matplotlib.pyplot as plt

def plot_graphs(history, string):
    plt.plot(history.history[string])
    plt.plot(history.history['val_'+string])
    plt.xlabel("Epochs")
    plt.ylabel(string)
    plt.legend([string, 'val_'+string])
    plt.show()

plot_graphs(history, "accuracy")
plot_graphs(history, "loss")
reverse_word_index = dict([(value, key) for (key, value) in word_index.items()])

def decode_sentence(text):
return ' '.join([reverse_word_index.get(i, '?') for i in text])

e = model.layers[0]
weights = e.get_weights()[0]
print(weights.shape) # shape: (vocab_size, embedding_dim)

# Expected output
# (1000, 16)

(1000, 16)
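decode_sentence and the extracted weight matrix are not exercised anywhere above; a small illustrative check (not part of the graded exercise):

# Recover (roughly) the first training article from its token ids;
# out-of-vocabulary words show up as <OOV> and padded positions as '?'.
print(decode_sentence(train_padded[0]))

# Look up the learned 16-dimensional vector for one frequent in-vocabulary word.
sample_word = reverse_word_index[2]   # index 1 is <OOV>, so 2 is the most common real word
print(sample_word, weights[2])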

import io

out_v = io.open('vecs.tsv', 'w', encoding='utf-8')
out_m = io.open('meta.tsv', 'w', encoding='utf-8')

for word_num in range(1, vocab_size):
    word = reverse_word_index[word_num]
    embeddings = weights[word_num]
    out_m.write(word + "\n")
    out_v.write('\t'.join([str(x) for x in embeddings]) + "\n")

out_v.close()
out_m.close()

try:
    from google.colab import files
except ImportError:
    pass
else:
    files.download('vecs.tsv')
    files.download('meta.tsv')
