SOURCE CODE: Image to Text

The document provides complete source code for image captioning on the COCO dataset, including installation of the required libraries, data loading, preprocessing, and model definition. It combines a CNN feature extractor with an LSTM decoder to generate captions for images, walking through data generation, model training, and caption generation. The code also visualizes images from the dataset together with their annotations.


SOURCE CODE:

!pip install CocoDataset==0.1.2


!wget http://images.cocodataset.org/annotations/annotations_trainval2017.zip
!unzip /content/annotations_trainval2017.zip
!wget http://images.cocodataset.org/zips/train2017.zip
!unzip /content/train2017.zip
!wget http://images.cocodataset.org/zips/val2017.zip
!unzip /content/val2017.zip
!pip install pycocotools
from pycocotools.coco import COCO # COCO python library
import numpy as np
import skimage.io as io
import matplotlib.pyplot as plt
import pylab
import random
import string
import cv2
import os
from pickle import dump, load
import json
import nltk
nltk.download("stopwords")
from nltk.corpus import stopwords
import tensorflow as tf
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import Embedding, LSTM, Dense, Bidirectional, Input, Dropout, Attention
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.models import Sequential
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.applications.xception import Xception, preprocess_input
from tensorflow.keras.preprocessing.image import load_img, img_to_array
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.layers import add  # keras.layers.merge was removed in newer Keras versions
from tensorflow.keras.models import Model, load_model
from tqdm.notebook import tqdm
pylab.rcParams['figure.figsize'] = (8.0, 10.0)
coco=COCO("../content/annotations/instances_train2017.json")
cats = coco.loadCats(coco.getCatIds())
maincategories = list(set([cat['supercategory'] for cat in cats]))
print("Number of main categories: ", len(maincategories))
print("List of main categories: ", maincategories)
subcategories = [cat['name'] for cat in cats]
print("Number of sub categories: ", len(subcategories))
print("List of sub categories: ", subcategories)
catIds = coco.getCatIds(catNms=subcategories)
subcategories_Ids = dict()
for i in range(0, len(subcategories)):
    subcategories_Ids[subcategories[i]] = catIds[i]
print("Sub categories with IDs :", subcategories_Ids)
subcategories_imageIds = dict()
for i in range(0, len(catIds)):
    imgIds = coco.getImgIds(catIds=catIds[i])
    img = []
    for j in imgIds:
        img.append(j)
    subcategories_imageIds[subcategories[i]] = img
print("Sub categories with Image IDs :", len(subcategories_imageIds))
length_dict = {key: len(value) for key, value in subcategories_imageIds.items()}
print("Total images in each sub categories: ", length_dict)
train_cats = subcategories_imageIds['bicycle'] + subcategories_imageIds['airplane']
imgIdss = coco.getImgIds(imgIds=train_cats)
print("Total Images: ", len(imgIdss))
fig = plt.gcf()
fig.set_size_inches(16, 16)
next_pix = imgIdss
random.shuffle(next_pix)
for i, img_path in enumerate(next_pix[0:12]):
    sp = plt.subplot(4, 4, i + 1)
    sp.axis('Off')
    img = coco.loadImgs(img_path)[0]
    I = io.imread(img['coco_url'])
    plt.imshow(I)
plt.show()
fig = plt.gcf()
fig.set_size_inches(16, 16)
for i, img_path in enumerate(next_pix[0:12]):
    sp = plt.subplot(4, 4, i + 1)
    sp.axis('Off')
    img = coco.loadImgs(img_path)[0]
    I = io.imread(img['coco_url'])
    plt.imshow(I)
    annIds = coco.getAnnIds(imgIds=img['id'], catIds=catIds, iscrowd=None)
    anns = coco.loadAnns(annIds)
    # print(anns)
    coco.showAnns(anns)
plt.show()
annFile="../content/annotations/person_keypoints_train2017.json"
coco_kps=COCO(annFile)
fig = plt.gcf()
fig.set_size_inches(16, 16)
for i, img_path in enumerate(next_pix[0:12]):
sp = plt.subplot(4, 4, i + 1)
sp.axis('Off')
img = coco.loadImgs(img_path)[0]
I = io.imread(img['coco_url'])
plt.imshow(I)
annIds = coco_kps.getAnnIds(imgIds=img['id'], catIds=catIds,
iscrowd=None)
anns = coco_kps.loadAnns(annIds)
coco_kps.showAnns(anns)
plt.show()
annFile = "../content/annotations/captions_train2017.json"
coco_caps=COCO(annFile)
img = coco.loadImgs(next_pix[0])[0]
I = io.imread(img['coco_url'])
plt.imshow(I)
annIds = coco_caps.getAnnIds(imgIds=img['id']);
anns = coco_caps.loadAnns(annIds)
coco_caps.showAnns(anns)
plt.show()
img = coco.loadImgs(next_pix[1])[0]
I = io.imread(img['coco_url'])
plt.imshow(I)
annIds = coco_caps.getAnnIds(imgIds=img['id']);
anns = coco_caps.loadAnns(annIds)
coco_caps.showAnns(anns)
plt.show()
img = coco.loadImgs(next_pix[10])[0]
I = io.imread(img['coco_url'])
plt.imshow(I)
annIds = coco_caps.getAnnIds(imgIds=img['id']);
anns = coco_caps.loadAnns(annIds)
coco_caps.showAnns(anns)
plt.show()
print("Total images for training: ", len(imgIdss))
dataset = dict()
imgcaptions = []
for imgid in imgIdss:
img = coco.loadImgs(imgid)[0]
annIds = coco_caps.getAnnIds(imgIds=img['id']);
anns = coco_caps.loadAnns(annIds)
imgcaptions = []
for cap in anns:
# Remove punctuation
cap = cap['caption'].translate(str.maketrans('', '',
string.punctuation))
# Replace - to blank
cap = cap.replace("-"," ")
# Split string into word list and Convert each word into lower
case
cap = cap.split()
cap = [word.lower() for word in cap]
# join word list into sentence and <start> and <end> tag to
each sentence which helps
# LSTM encoder-decoder model while training.
cap = '<start> ' + " ".join(cap) + ' <end>'
imgcaptions.append(cap)
dataset[img['coco_url']] = imgcaptions
print("Length of Dataset: ",len(dataset))
print(dataset['http://images.cocodataset.org/train2017/000000047084.jpg
'])
#dataset
from itertools import chain
# Flatten the per-image caption lists into a single list,
# e.g. [[1,3],[4,8]] -> [1,3,4,8]
flatten_list = list(chain.from_iterable(dataset.values()))
tokenizer = Tokenizer(oov_token='<oov>')  # <oov> stands in for words not in word_index
tokenizer.fit_on_texts(flatten_list)
total_words = len(tokenizer.word_index) + 1
print("Vocabulary length: ", total_words)
print("Bicycle ID: ", tokenizer.word_index['bicycle'])
print("Airplane ID: ", tokenizer.word_index['airplane'])
print("Image features length: ", len(image_features))
image_features['http://images.cocodataset.org/train2017/000000047084.jp
g'].shape
def dict_to_list(descriptions):
    all_desc = []
    for key in descriptions.keys():
        all_desc.extend(descriptions[key])
    return all_desc

def max_length(descriptions):
    desc_list = dict_to_list(descriptions)
    return max(len(d.split()) for d in desc_list)

max_length = max_length(dataset)
max_length
# create input-output sequence pairs from the image descriptions
def data_generator(descriptions, features, tokenizer, max_length):
    while 1:
        for key, description_list in descriptions.items():
            feature = features[key][0]
            input_image, input_sequence, output_word = create_sequences(
                tokenizer, max_length, description_list, feature)
            yield ([input_image, input_sequence], output_word)
def create_sequences(tokenizer, max_length, desc_list, feature):
    X1, X2, y = list(), list(), list()
    # walk through each description for the image
    for desc in desc_list:
        # encode the sequence
        seq = tokenizer.texts_to_sequences([desc])[0]
        # split one sequence into multiple X,y pairs
        for i in range(1, len(seq)):
            # split into input and output pair
            in_seq, out_seq = seq[:i], seq[i]
            # pad input sequence
            in_seq = pad_sequences([in_seq], maxlen=max_length)[0]
            # one-hot encode the output word
            out_seq = to_categorical([out_seq], num_classes=total_words)[0]
            # store
            X1.append(feature)  # image features
            X2.append(in_seq)   # caption input
            y.append(out_seq)   # caption output
    return np.array(X1), np.array(X2), np.array(y)
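
To make the splitting above concrete, here is a toy illustration (hypothetical token IDs, not from the dataset) of how one encoded caption expands into training pairs:

seq = [2, 9, 4, 7]  # hypothetical tokenized caption
for i in range(1, len(seq)):
    print(seq[:i], "->", seq[i])
# [2] -> 9
# [2, 9] -> 4
# [2, 9, 4] -> 7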
from tensorflow.keras.utils import plot_model

# define the captioning model
def define_model(total_words, max_length):
    # features from the CNN model, squeezed from 2048 to 256 nodes
    inputs1 = Input(shape=(2048,))
    fe1 = Dropout(0.5)(inputs1)
    fe2 = Dense(256, activation='relu')(fe1)
    # LSTM sequence model
    inputs2 = Input(shape=(max_length,))
    se1 = Embedding(total_words, 256, mask_zero=True)(inputs2)
    se2 = Dropout(0.5)(se1)
    se3 = LSTM(256)(se2)
    # merge both branches
    decoder1 = add([fe2, se3])
    decoder2 = Dense(256, activation='relu')(decoder1)
    outputs = Dense(total_words, activation='softmax')(decoder2)
    # tie it together: [image, seq] -> word
    model = Model(inputs=[inputs1, inputs2], outputs=outputs)
    model.compile(loss='categorical_crossentropy', optimizer='adam')
    # summarize the model
    print(model.summary())
    plot_model(model, to_file='model.png', show_shapes=True)
    return model
# train our model
print('Dataset: ', len(dataset))
print('Descriptions: train=', len(dataset))
print('Photos: train=', len(image_features))
print('Vocabulary Size:', total_words)
print('Description Length: ', max_length)
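
The listing prints these statistics under "# train our model" but omits the actual training call. A minimal sketch, not part of the original code, assuming the data_generator defined above and the models/model_i.h5 naming implied by the commented-out load_model('./models/model_0.h5') line further down; the epoch count is an assumption:

model = define_model(total_words, max_length)
epochs = 10               # assumed value; tune as needed
steps = len(dataset)      # the generator yields one batch per image
os.makedirs("models", exist_ok=True)
for i in range(epochs):
    generator = data_generator(dataset, image_features, tokenizer, max_length)
    model.fit(generator, epochs=1, steps_per_epoch=steps, verbose=1)
    model.save("models/model_" + str(i) + ".h5")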
import numpy as np
from PIL import Image
import matplotlib.pyplot as plt
img_paths = ["../content/val2017/000000001761.jpg",
             "../content/val2017/000000022396.jpg",
             "../content/val2017/000000098520.jpg",
             "../content/val2017/000000101762.jpg",
             "../content/val2017/000000224051.jpg",
             ]
def extract_features(filename, model):
    try:
        image = Image.open(filename)
    except:
        print("ERROR: Couldn't open image! Make sure the image path and extension is correct")
        return None
    image = image.resize((299, 299))
    image = np.array(image)
    # for images that have 4 channels, convert them to 3 channels
    if image.shape[2] == 4:
        image = image[..., :3]
    image = np.expand_dims(image, axis=0)
    # scale pixel values to [-1, 1], matching Xception's preprocess_input
    image = image / 127.5
    image = image - 1.0
    feature = model.predict(image)
    return feature
def word_for_id(integer, tokenizer):
    for word, index in tokenizer.word_index.items():
        if index == integer:
            return word
    return None
def generate_desc(model, tokenizer, photo, max_length):
    # the tokenizer's default filters strip '<' and '>', so the '<start>'
    # and '<end>' tags were indexed as 'start' and 'end'
    in_text = 'start'
    for i in range(max_length):
        sequence = tokenizer.texts_to_sequences([in_text])[0]
        sequence = pad_sequences([sequence], maxlen=max_length)
        pred = model.predict([photo, sequence], verbose=0)
        pred = np.argmax(pred)
        word = word_for_id(pred, tokenizer)
        if word is None:
            break
        in_text += ' ' + word
        if word == 'end':
            break
    return in_text
#max_length = 46
#model = load_model('./models/model_0.h5')
xception_model = Xception(include_top=False, pooling="avg")
photo = extract_features(img_paths[0], xception_model)
img = Image.open(img_paths[0])
description = generate_desc(model, tokenizer, photo, max_length)
print("\n\n")
print(description)
plt.imshow(img)
photo = extract_features(img_paths[1], xception_model)
img = Image.open(img_paths[1])
description = generate_desc(model, tokenizer, photo, max_length)
print("\n\n")
print(description)
plt.imshow(img)
photo = extract_features(img_paths[2], xception_model)
img = Image.open(img_paths[2])
description = generate_desc(model, tokenizer, photo, max_length)
print("\n\n")
print(description)
plt.imshow(img)
photo = extract_features(img_paths[3], xception_model)
img = Image.open(img_paths[3])
description = generate_desc(model, tokenizer, photo, max_length)
print("\n\n")
print(description)
plt.imshow(img)
photo = extract_features(img_paths[4], xception_model)
img = Image.open(img_paths[4])
description = generate_desc(model, tokenizer, photo, max_length)
print("\n\n")
print(description)
plt.imshow(img)
