
Artificial Intelligence Lab Work (6)

レポート解答用紙 (Report Answer Sheet)

学生証番号 (Student ID): 22520205


名前(Name): Cao Thành Đạt (Cao Thanh Dat/カオ・タイン・ダット)

問題 1 (Problem 1)
(プログラム / Program)
import requests
import torch
import torch.nn.functional as F
import nltk
import tarfile

# Download and extract the IWSLT'15 English-Vietnamese data.
def iwslt15(train_test):
    url = "https://github.com/stefan-it/nmt-en-vi/raw/master/data/"
    r = requests.get(url + train_test + "-en-vi.tgz")
    filename = train_test + "-en-vi.tar.gz"
    with open(filename, 'wb') as f:
        f.write(r.content)
    tarfile.open(filename, 'r:gz').extractall("iwslt15")

iwslt15("train")
iwslt15("test-2013")

# Read the parallel corpora as lists of whitespace-tokenized sentences.
f = open("iwslt15/train.en")
train_en = [line.split() for line in f]
f.close()

f = open("iwslt15/train.vi")
train_vi = [line.split() for line in f]
f.close()

f = open("iwslt15/tst2013.en")
test_en = [line.split() for line in f]
f.close()

f = open("iwslt15/tst2013.vi")
test_vi = [line.split() for line in f]
f.close()

for i in range(10):
    print(train_en[i])
    print(train_vi[i])

print("# of line", len(train_en), len(train_vi), len(test_en), len(test_vi))

MODELNAME = "iwslt15-en-vi-rnn.model"
EPOCH = 10
BATCHSIZE = 128
LR = 0.0001
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

def make_vocab(train_data, min_freq):
    vocab = {}
    for tokenlist in train_data:
        for token in tokenlist:
            if token not in vocab:
                vocab[token] = 0
            vocab[token] += 1
    vocablist = [('<unk>', 0), ('<pad>', 0), ('<cls>', 0), ('<eos>', 0)]
    vocabidx = {}
    for token, freq in vocab.items():
        if freq >= min_freq:
            idx = len(vocablist)
            vocablist.append((token, freq))
            vocabidx[token] = idx
    vocabidx['<unk>'] = 0
    vocabidx['<pad>'] = 1
    vocabidx['<cls>'] = 2
    vocabidx['<eos>'] = 3
    return vocablist, vocabidx
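# The four special tokens keep fixed ids (<unk>=0, <pad>=1, <cls>=2, <eos>=3);
# regular tokens that pass the min_freq filter are numbered from 4 onwards.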

vocablist_en, vocabidx_en = make_vocab(train_en, 3)
vocablist_vi, vocabidx_vi = make_vocab(train_vi, 3)
print("vocab size en:", len(vocablist_en))
print("vocab size vi:", len(vocablist_vi))

def preprocess(data, vocabidx):
    rr = []
    for tokenlist in data:
        tkl = ['<cls>']
        for token in tokenlist:
            tkl.append(token if token in vocabidx else '<unk>')
        tkl.append('<eos>')
        rr.append(tkl)
    return rr

train_en_prep = preprocess(train_en, vocabidx_en)
train_vi_prep = preprocess(train_vi, vocabidx_vi)
test_en_prep = preprocess(test_en, vocabidx_en)

for i in range(5):
    print(train_en_prep[i])
    print(train_vi_prep[i])
    print(test_en_prep[i])

train_data = list(zip(train_en_prep, train_vi_prep))
train_data.sort(key=lambda x: (len(x[0]), len(x[1])))
test_data = list(zip(test_en_prep, test_en, test_vi))

for i in range(5):
    print(train_data[i])
for i in range(5):
    print(test_data[i])

def make_batch(data, batchsize):
    bb = []
    ben = []
    bvi = []
    for en, vi in data:
        ben.append(en)
        bvi.append(vi)
        if len(ben) >= batchsize:
            bb.append((ben, bvi))
            ben = []
            bvi = []
    if len(ben) > 0:
        bb.append((ben, bvi))
    return bb

train_data = make_batch(train_data, BATCHSIZE)

for i in range(5):
    print(train_data[i])

def padding_batch(b):
    maxlen = max([len(x) for x in b])
    for tkl in b:
        for i in range(maxlen - len(tkl)):
            tkl.append('<pad>')

def padding(bb):
    for ben, bvi in bb:
        padding_batch(ben)
        padding_batch(bvi)

padding(train_data)

for i in range(3):
    print(train_data[i])

train_data = [([[vocabidx_en[token] for token in tokenlist] for tokenlist in ben],
               [[vocabidx_vi[token] for token in tokenlist] for tokenlist in bvi])
              for ben, bvi in train_data]
test_data = [([vocabidx_en[token] for token in enprep], en, vi) for enprep, en, vi in test_data]

for i in range(3):
    print(train_data[i])
for i in range(3):
    print(test_data[i])
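# At this point train_data is a list of (ben, bvi) batches of token-id lists, padded to a
# common length within each batch; test_data keeps the raw English and Vietnamese token
# lists alongside the id sequence so the references can be printed during testing.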

class RNNEncDec(torch.nn.Module):
    def __init__(self, vocablist_x, vocabidx_x, vocablist_y, vocabidx_y):
        super(RNNEncDec, self).__init__()
        self.encemb = torch.nn.Embedding(len(vocablist_x), 300, padding_idx=vocabidx_x['<pad>'])
        self.encrnn = torch.nn.Linear(300, 300)
        self.decemb = torch.nn.Embedding(len(vocablist_y), 300, padding_idx=vocabidx_y['<pad>'])
        self.decrnn = torch.nn.Linear(300, 300)
        self.decout = torch.nn.Linear(300, len(vocablist_y))

    def forward(self, x):
        x, y = x[0], x[1]
        e_x = self.encemb(x)
        n_x = e_x.size()[0]
        h = torch.zeros(300, dtype=torch.float32).to(DEVICE)
        for i in range(n_x):
            h = F.relu(e_x[i] + self.encrnn(h))
        e_y = self.decemb(y)
        n_y = e_y.size()[0]
        loss = torch.tensor(0., dtype=torch.float32).to(DEVICE)
        for i in range(n_y - 1):
            h = F.relu(e_y[i] + self.decrnn(h))
            loss += F.cross_entropy(self.decout(h), y[i + 1])
        return loss

    def evaluate(self, x, vocablist_y, vocabidx_y):
        e_x = self.encemb(x)
        n_x = e_x.size()[0]
        h = torch.zeros(300, dtype=torch.float32).to(DEVICE)
        for i in range(n_x):
            h = F.relu(e_x[i] + self.encrnn(h))
        y = torch.tensor([vocabidx_y['<cls>']]).to(DEVICE)
        e_y = self.decemb(y)
        pred = []
        for i in range(30):
            h = F.relu(e_y + self.decrnn(h))
            pred_id = self.decout(h).squeeze().argmax()
            if pred_id == vocabidx_y['<eos>']:
                break
            pred_y = vocablist_y[pred_id][0]
            pred.append(pred_y)
            y[0] = pred_id
            e_y = self.decemb(y)
        return pred
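# RNNEncDec notes: encoder and decoder both use a Linear layer as the transition of a
# plain Elman-style recurrence, h_t = ReLU(emb(token_t) + W h_{t-1}). forward() trains
# with teacher forcing, summing cross-entropy of each step's output against the next
# gold target token, while evaluate() decodes greedily from '<cls>' until '<eos>' or a
# 30-token limit.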

def train():
    model = RNNEncDec(vocablist_en, vocabidx_en, vocablist_vi, vocabidx_vi).to(DEVICE)
    optimizer = torch.optim.Adam(model.parameters(), lr=LR)
    for epoch in range(EPOCH):
        loss = 0
        step = 0
        for ben, bvi in train_data:
            ben = torch.tensor(ben, dtype=torch.int64).transpose(0, 1).to(DEVICE)
            bvi = torch.tensor(bvi, dtype=torch.int64).transpose(0, 1).to(DEVICE)
            optimizer.zero_grad()
            batchloss = model((ben, bvi))
            batchloss.backward()
            optimizer.step()
            loss = loss + batchloss.item()
            if step % 100 == 0:
                print("step:", step, "batchloss:", batchloss.item())
            step += 1
        print("epoch", epoch, ": loss", loss)
    torch.save(model.state_dict(), MODELNAME)

def test():
    total = 0
    correct = 0
    model = RNNEncDec(vocablist_en, vocabidx_en, vocablist_vi, vocabidx_vi).to(DEVICE)
    model.load_state_dict(torch.load(MODELNAME))
    model.eval()
    ref = []
    pred = []
    for enprep, en, vi in test_data:
        input = torch.tensor([enprep], dtype=torch.int64).transpose(0, 1).to(DEVICE)
        p = model.evaluate(input, vocablist_vi, vocabidx_vi)
        if len(ref) < 10:
            print("INPUT", en)
            print("REF", vi)
            print("MT", p)
        ref.append([vi])
        pred.append(p)
    bleu = nltk.translate.bleu_score.corpus_bleu(ref, pred)
    print("total:", len(test_data))
    print("bleu:", bleu)

train()
test()
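# BLEU note: corpus_bleu takes, for each hypothesis, a *list* of tokenized references,
# which is why test() wraps each reference sentence as [vi]. A minimal, self-contained
# usage sketch with made-up tokens (and optional smoothing, which helps when short
# hypotheses share no higher-order n-grams with the reference):
from nltk.translate.bleu_score import corpus_bleu, SmoothingFunction
toy_refs = [[["this", "is", "a", "test"]]]   # one hypothesis, one reference
toy_hyps = [["this", "is", "a", "test"]]
print(corpus_bleu(toy_refs, toy_hyps, smoothing_function=SmoothingFunction().method1))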
(実行結果 / Execution Results)
問題 2 (Problem 2)
(プログラム / Program)
import requests
import torch
import torch.nn.functional as F
import nltk
import tarfile

def iwslt15(train_test):
    url = "https://github.com/stefan-it/nmt-en-vi/raw/master/data/"
    r = requests.get(url + train_test + "-en-vi.tgz")
    filename = train_test + "-en-vi.tar.gz"
    with open(filename, 'wb') as f:
        f.write(r.content)
    tarfile.open(filename, 'r:gz').extractall("iwslt15")

iwslt15("train")
iwslt15("test-2013")

f = open("iwslt15/train.en")
train_en = [line.split() for line in f]
f.close()

f = open("iwslt15/train.vi")
train_vi = [line.split() for line in f]
f.close()

f = open("iwslt15/tst2013.en")
test_en = [line.split() for line in f]
f.close()

f = open("iwslt15/tst2013.vi")
test_vi = [line.split() for line in f]
f.close()

for i in range(10):
    print(train_en[i])
    print(train_vi[i])

print("# of line", len(train_en), len(train_vi), len(test_en), len(test_vi))

MODELNAME = "iwslt15-en-vi-lstm-dropout.model"
EPOCH = 10
BATCHSIZE = 128
LR = 0.0001
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

def make_vocab(train_data, min_freq):
    vocab = {}
    for tokenlist in train_data:
        for token in tokenlist:
            if token not in vocab:
                vocab[token] = 0
            vocab[token] += 1
    vocablist = [('<unk>', 0), ('<pad>', 0), ('<cls>', 0), ('<eos>', 0)]
    vocabidx = {}
    for token, freq in vocab.items():
        if freq >= min_freq:
            idx = len(vocablist)
            vocablist.append((token, freq))
            vocabidx[token] = idx
    vocabidx['<unk>'] = 0
    vocabidx['<pad>'] = 1
    vocabidx['<cls>'] = 2
    vocabidx['<eos>'] = 3
    return vocablist, vocabidx

vocablist_en, vocabidx_en = make_vocab(train_en, 3)
vocablist_vi, vocabidx_vi = make_vocab(train_vi, 3)

print("vocab size en:", len(vocablist_en))
print("vocab size vi:", len(vocablist_vi))

def preprocess(data, vocabidx):
    rr = []
    for tokenlist in data:
        tkl = ['<cls>']
        for token in tokenlist:
            tkl.append(token if token in vocabidx else '<unk>')
        tkl.append('<eos>')
        rr.append(tkl)
    return rr

train_en_prep = preprocess(train_en, vocabidx_en)
train_vi_prep = preprocess(train_vi, vocabidx_vi)
test_en_prep = preprocess(test_en, vocabidx_en)

for i in range(5):
    print(train_en_prep[i])
    print(train_vi_prep[i])
    print(test_en_prep[i])

train_data = list(zip(train_en_prep, train_vi_prep))
train_data.sort(key=lambda x: (len(x[0]), len(x[1])))
test_data = list(zip(test_en_prep, test_en, test_vi))

for i in range(5):
    print(train_data[i])
for i in range(5):
    print(test_data[i])

def make_batch(data, batchsize):
    bb = []
    ben = []
    bvi = []
    for en, vi in data:
        ben.append(en)
        bvi.append(vi)
        if len(ben) >= batchsize:
            bb.append((ben, bvi))
            ben = []
            bvi = []
    if len(ben) > 0:
        bb.append((ben, bvi))
    return bb

train_data = make_batch(train_data, BATCHSIZE)

for i in range(5):
    print(train_data[i])

def padding_batch(b):
    maxlen = max([len(x) for x in b])
    for tkl in b:
        for i in range(maxlen - len(tkl)):
            tkl.append('<pad>')

def padding(bb):
    for ben, bvi in bb:
        padding_batch(ben)
        padding_batch(bvi)

padding(train_data)

for i in range(3):
    print(train_data[i])

train_data = [([[vocabidx_en[token] for token in tokenlist] for tokenlist in ben],
               [[vocabidx_vi[token] for token in tokenlist] for tokenlist in bvi])
              for ben, bvi in train_data]
test_data = [([vocabidx_en[token] for token in enprep], en, vi) for enprep, en, vi in test_data]

for i in range(3):
    print(train_data[i])
for i in range(3):
    print(test_data[i])

class LSTMEncDec(torch.nn.Module):
    def __init__(self, vocablist_x, vocabidx_x, vocablist_y, vocabidx_y):
        super(LSTMEncDec, self).__init__()
        self.vocabidx_y = vocabidx_y
        self.encemb = torch.nn.Embedding(len(vocablist_x), 256, padding_idx=vocabidx_x['<pad>'])
        self.enc_lstm = torch.nn.LSTM(256, 516, 2, dropout=0.5)
        self.decemb = torch.nn.Embedding(len(vocablist_y), 256, padding_idx=vocabidx_y['<pad>'])
        self.dec_lstm = torch.nn.LSTM(256, 516, 2, dropout=0.5)
        self.dropout = torch.nn.Dropout(0.5)
        self.decout = torch.nn.Linear(516, len(vocablist_y))

    def forward(self, x):
        src, tgt = x[0], x[1]
        emb_src = self.encemb(src)
        _, (h, c) = self.enc_lstm(emb_src)
        emb_tgt = self.decemb(tgt)
        outputs, _ = self.dec_lstm(emb_tgt, (h, c))
        outputs = self.dropout(outputs)
        logits = self.decout(outputs)
        logits = logits[:-1].reshape(-1, logits.size(-1))
        tgt_y = tgt[1:].reshape(-1)
        loss = F.cross_entropy(logits, tgt_y, ignore_index=self.vocabidx_y['<pad>'])
        return loss

    def evaluate(self, x, vocablist_y, vocabidx_y, max_len=50):
        emb_src = self.encemb(x)
        _, (h, c) = self.enc_lstm(emb_src)
        inputs = torch.tensor([[vocabidx_y['<cls>']]]).to(x.device)
        pred = []
        for _ in range(max_len):
            emb = self.decemb(inputs)
            output, (h, c) = self.dec_lstm(emb, (h, c))
            output = self.dropout(output)
            logits = self.decout(output.squeeze(0))
            next_id = logits.argmax(dim=-1)
            if next_id.item() == vocabidx_y['<eos>']:
                break
            pred.append(vocablist_y[next_id.item()][0])
            inputs = next_id.unsqueeze(0)
        return pred
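# LSTMEncDec notes: the final (h, c) of the 2-layer encoder LSTM initializes the decoder
# LSTM. forward() scores the whole target sequence in one pass with teacher forcing,
# aligning logits[:-1] with tgt[1:] and masking '<pad>' via ignore_index, while evaluate()
# decodes greedily one token at a time, carrying (h, c) across steps.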

def train():
    model = LSTMEncDec(vocablist_en, vocabidx_en, vocablist_vi, vocabidx_vi).to(DEVICE)
    optimizer = torch.optim.Adam(model.parameters(), lr=LR)
    for epoch in range(EPOCH):
        loss = 0
        step = 0
        for ben, bvi in train_data:
            ben = torch.tensor(ben, dtype=torch.int64).transpose(0, 1).to(DEVICE)
            bvi = torch.tensor(bvi, dtype=torch.int64).transpose(0, 1).to(DEVICE)
            optimizer.zero_grad()
            batchloss = model((ben, bvi))
            batchloss.backward()
            optimizer.step()
            loss = loss + batchloss.item()
            if step % 100 == 0:
                print("step:", step, "batchloss:", batchloss.item())
            step += 1
        print("epoch", epoch, ": loss", loss)
    torch.save(model.state_dict(), MODELNAME)

def test():
    total = 0
    correct = 0
    model = LSTMEncDec(vocablist_en, vocabidx_en, vocablist_vi, vocabidx_vi).to(DEVICE)
    model.load_state_dict(torch.load(MODELNAME))
    model.eval()
    ref = []
    pred = []
    for enprep, en, vi in test_data:
        input = torch.tensor([enprep], dtype=torch.int64).transpose(0, 1).to(DEVICE)
        p = model.evaluate(input, vocablist_vi, vocabidx_vi)
        if len(ref) < 10:
            print("INPUT", en)
            print("REF", vi)
            print("MT", p)
        ref.append([vi])
        pred.append(p)
    bleu = nltk.translate.bleu_score.corpus_bleu(ref, pred)
    print("total:", len(test_data))
    print("bleu:", bleu)

train()
test()
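# torch.nn.LSTM defaults to batch_first=False, i.e. inputs are (seq_len, batch, features),
# which is why train() transposes each batch before calling the model. A minimal shape
# check along those lines (the dummy sizes 7 and 4 are arbitrary, chosen only for
# illustration):
m = LSTMEncDec(vocablist_en, vocabidx_en, vocablist_vi, vocabidx_vi).to(DEVICE)
dummy = torch.zeros(7, 4, dtype=torch.int64).to(DEVICE)  # (seq_len=7, batch=4) of token ids
emb = m.encemb(dummy)                                     # -> (7, 4, 256)
out, (h, c) = m.enc_lstm(emb)                             # out: (7, 4, 516); h, c: (2, 4, 516)
print(emb.shape, out.shape, h.shape, c.shape)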
(実行結果 / Execution Results)
