codefp1

The document outlines a process for using the Kaggle API to download and prepare the Flickr8k audio-caption dataset for a multimodal deep learning project. It includes steps for loading audio and image data, defining a custom dataset class, and creating a neural network model that combines image and audio processing. The document also details the training and evaluation procedures, including data splitting, loss calculation, and saving the model.

*1

from google.colab import files

# Upload kaggle.json (the API token from your Kaggle account settings)
files.upload()

# Move kaggle.json to the directory the Kaggle CLI expects
!mkdir -p ~/.kaggle
!mv kaggle.json ~/.kaggle/
!chmod 600 ~/.kaggle/kaggle.json

# Install the Kaggle API client
!pip install kaggle

*2

# Download the audio-caption dataset
!kaggle datasets download -d warcoder/flickr-8k-audio-caption-corpus

# Unzip dataset
!unzip flickr-8k-audio-caption-corpus.zip -d /content/flickr8k_audio

*3

# Download and unzip the Flickr8k images dataset
!kaggle datasets download -d adityajn105/flickr8k
!unzip flickr8k.zip -d /content/flickr8k_images

*4

import os
import librosa

audio_dir = "/content/flickr8k_audio/flickr_audio/flickr_audio/wavs"

# Example: load one audio file (pass just the filename to os.path.join,
# not the full absolute path; replace with an actual filename)
audio_path = os.path.join(audio_dir, "1000268201_693b08cb0e_0.wav")
audio, sr = librosa.load(audio_path, sr=None)
print(f"Loaded audio with shape: {audio.shape}, Sample Rate: {sr}")

*5

import os
from PIL import Image
import librosa
from IPython.display import display, Audio

# Define paths
audio_dir = "/content/flickr8k_audio/flickr_audio/flickr_audio/wavs"
image_dir = "/content/flickr8k_images/Images"

# Load and display an example image
image_path = os.path.join(image_dir, "1000268201_693b08cb0e.jpg")
image = Image.open(image_path)
display(image)

# Load an example audio file
audio_path = os.path.join(audio_dir, "1000268201_693b08cb0e_0.wav")
audio, sr = librosa.load(audio_path, sr=None)
print(f"Audio Loaded: Shape={audio.shape}, Sampling Rate={sr}")

# Play the audio inline in Colab
display(Audio(audio_path))

*6

import os
import pandas as pd
import librosa
from PIL import Image
from torch.utils.data import Dataset

class Flickr8kAudioImageDataset(Dataset):
    def __init__(self, mapping_file, image_dir, audio_dir, transform=None):
        # mapping_file: CSV with "image", "audio", and "caption" columns
        self.data = pd.read_csv(mapping_file)
        self.image_dir = image_dir
        self.audio_dir = audio_dir
        self.transform = transform

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        row = self.data.iloc[idx]
        image_path = os.path.join(self.image_dir, row["image"])
        audio_path = os.path.join(self.audio_dir, row["audio"])
        caption = row["caption"]

        # Load image
        image = Image.open(image_path)
        if self.transform:
            image = self.transform(image)

        # Load audio
        audio, sr = librosa.load(audio_path, sr=None)

        return image, audio, caption
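The class above assumes mapping_file is a CSV with image, audio, and caption columns. No such file ships with the downloads above, so you would build it yourself. A hypothetical sketch that pairs each image with its first recording (the corpus names its wav files <image_base>_<n>.wav, and the caption column is left empty here):

import os
import pandas as pd

rows = []
for image_name in sorted(os.listdir(image_dir)):
    base = os.path.splitext(image_name)[0]
    wav_name = f"{base}_0.wav"  # first of up to five recordings per image
    if os.path.exists(os.path.join(audio_dir, wav_name)):
        rows.append({"image": image_name, "audio": wav_name, "caption": ""})

pd.DataFrame(rows).to_csv("mapping.csv", index=False)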

*7

import os

image_dir = "/content/flickr8k_images/Images"
audio_dir = "/content/flickr8k_audio/flickr_audio/flickr_audio/wavs"

image_filenames = os.listdir(image_dir)
audio_filenames = os.listdir(audio_dir)

print(f"Number of images: {len(image_filenames)}")
print(f"Number of audio files: {len(audio_filenames)}")

# Optional: print the first few files to see which ones exist
print(f"First few image filenames: {image_filenames[:5]}")
print(f"First few audio filenames: {audio_filenames[:5]}")

*8

import os
import torch
from torch.utils.data import Dataset, DataLoader
from PIL import Image
import librosa
from torchvision import transforms
import numpy as np
import matplotlib.pyplot as plt

class Flickr8kAudioImageDataset(Dataset):
    def __init__(self, image_dir, audio_dir, transform=None, audio_length=22050):
        self.image_dir = image_dir
        self.audio_dir = audio_dir
        self.transform = transform
        # Target length for audio (e.g., 22050 samples = 1 second at 22.05 kHz)
        self.audio_length = audio_length

        # Get image and audio filenames
        self.image_filenames = os.listdir(image_dir)
        self.audio_filenames = os.listdir(audio_dir)

        # Sort filenames by their base name (extensions stripped)
        image_base_filenames = sorted([os.path.splitext(f)[0] for f in self.image_filenames])
        audio_base_filenames = sorted([os.path.splitext(f)[0] for f in self.audio_filenames])

        # Map each image to its first matching audio file; audio base names
        # look like "<image_base>_<n>", so startswith pairs them
        image_to_audio_map = {}
        for image_base in image_base_filenames:
            corresponding_audio = [audio for audio in audio_base_filenames
                                   if audio.startswith(image_base)]
            if corresponding_audio:
                image_to_audio_map[image_base] = corresponding_audio[0]

        # Create lists of matching image and audio filenames
        self.image_filenames = [base + ".jpg" for base in image_to_audio_map.keys()]
        self.audio_filenames = [audio + ".wav" for audio in image_to_audio_map.values()]

        # Ensure there is at least one matching pair
        assert len(self.image_filenames) > 0, "No matching image and audio files found"

    def __len__(self):
        return len(self.image_filenames)

    def __getitem__(self, idx):
        # Get the image and audio file names
        image_filename = self.image_filenames[idx]
        audio_filename = self.audio_filenames[idx]

        # Load the image (convert to RGB so the tensor always has 3 channels)
        image_path = os.path.join(self.image_dir, image_filename)
        image = Image.open(image_path).convert("RGB")

        # Apply the given transform, or a default resize + to-tensor
        if self.transform:
            image = self.transform(image)
        else:
            default_transform = transforms.Compose([
                transforms.Resize((256, 256)),
                transforms.ToTensor(),
            ])
            image = default_transform(image)

        # Load the audio
        audio_path = os.path.join(self.audio_dir, audio_filename)
        audio, sr = librosa.load(audio_path, sr=None)

        # Pad with zeros or truncate so the audio matches the target length
        if len(audio) < self.audio_length:
            audio = np.pad(audio, (0, self.audio_length - len(audio)), mode='constant')
        else:
            audio = audio[:self.audio_length]

        # Convert audio to tensor
        audio = torch.tensor(audio, dtype=torch.float32)

        # Return image, audio, and the filename (for captions or other lookups)
        return image, audio, audio_filename

# Define paths
audio_dir = "/content/flickr8k_audio/flickr_audio/flickr_audio/wavs"
image_dir = "/content/flickr8k_images/Images"

# Initialize dataset and DataLoader
dataset = Flickr8kAudioImageDataset(
    image_dir=image_dir,
    audio_dir=audio_dir,
    transform=None,       # Add image transformations if needed
    audio_length=22050,   # 1 second at 22.05 kHz
)
dataloader = DataLoader(dataset, batch_size=32, shuffle=True)

# Iterate over the DataLoader
for batch in dataloader:
    images, audios, filenames = batch
    print("Images shape:", images[0].size())  # Per-image shape, [3, 256, 256]
    print("Audios shape:", audios.shape)      # [batch_size, 22050]
    print("Filenames:", filenames[:5])        # Show first few filenames

    # Visualize the first image and its corresponding audio waveform
    plt.figure(figsize=(12, 6))

    # Plot image
    plt.subplot(1, 2, 1)
    plt.imshow(images[0].permute(1, 2, 0))  # [C, H, W] -> [H, W, C] for plotting
    plt.title(f"Image: {filenames[0]}")

    # Plot audio waveform
    plt.subplot(1, 2, 2)
    plt.plot(audios[0].numpy())  # Convert tensor to numpy for plotting
    plt.title(f"Audio waveform: {filenames[0]}")
    plt.show()

    break  # Only display the first batch; remove to loop over all batches
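The dataset is created with transform=None, so it falls back to the default resize + to-tensor above. For a custom pipeline, here is a sketch of a typical one; the Normalize statistics are the standard ImageNet values, an assumption rather than anything tuned for Flickr8k:

from torchvision import transforms

image_transform = transforms.Compose([
    transforms.Resize((256, 256)),
    transforms.ToTensor(),
    # ImageNet mean/std -- assumed defaults, not Flickr8k-specific
    transforms.Normalize(mean=[0.485, 0.456, 0.406],
                         std=[0.229, 0.224, 0.225]),
])

dataset = Flickr8kAudioImageDataset(
    image_dir=image_dir,
    audio_dir=audio_dir,
    transform=image_transform,
    audio_length=22050,
)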

*9

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

# Define a simple CNN for image processing
class ImageModel(nn.Module):
    def __init__(self):
        super(ImageModel, self).__init__()
        self.conv1 = nn.Conv2d(3, 32, kernel_size=3, stride=1, padding=1)
        self.pool = nn.MaxPool2d(2, 2)
        # Use adaptive pooling to handle varying input sizes
        self.adaptive_pool = nn.AdaptiveAvgPool2d((8, 8))
        self.fc1 = nn.Linear(32 * 8 * 8, 512)
        self.fc2 = nn.Linear(512, 128)

    def forward(self, x):
        x = self.pool(F.relu(self.conv1(x)))
        x = self.adaptive_pool(x)       # Adaptive pooling handles different input sizes
        x = x.view(-1, 32 * 8 * 8)      # Flatten the output
        x = F.relu(self.fc1(x))
        x = self.fc2(x)
        return x

# Define an RNN for audio processing
class AudioModel(nn.Module):
    def __init__(self, input_size=22050, hidden_size=128):
        super(AudioModel, self).__init__()
        self.rnn = nn.LSTM(input_size, hidden_size, batch_first=True)
        self.fc = nn.Linear(hidden_size, 128)

    def forward(self, x):
        # The LSTM expects (batch_size, seq_len, input_size); the raw
        # waveforms arrive as (batch_size, input_size), so add a
        # seq_len=1 dimension before the recurrent layer
        if x.dim() == 2:
            x = x.unsqueeze(1)
        out, _ = self.rnn(x)    # (batch_size, seq_len, hidden_size)
        out = out[:, -1, :]     # Take the last timestep
        out = self.fc(out)      # Pass through the fully connected layer
        return out

# Combine both branches in the MultimodalModel class
class MultimodalModel(nn.Module):
    def __init__(self, num_classes):
        super(MultimodalModel, self).__init__()
        self.image_model = ImageModel()
        self.audio_model = AudioModel()
        self.fc = nn.Linear(128 + 128, num_classes)  # For multiclass classification

    def forward(self, image, audio):
        image_features = self.image_model(image)
        audio_features = self.audio_model(audio)
        # Concatenate image and audio features
        combined = torch.cat((image_features, audio_features), dim=1)
        output = self.fc(combined)
        return output

# Initialize the model (num_classes=10 is a placeholder; set it to the
# size of your label set)
model = MultimodalModel(num_classes=10)
criterion = nn.CrossEntropyLoss()  # For multiclass classification
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Move the model to GPU if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Training loop (simplified)
num_epochs = 10
for epoch in range(num_epochs):
    model.train()
    for images, audios, filenames in dataloader:
        images, audios = images.to(device), audios.to(device)  # Send data to GPU if available

        optimizer.zero_grad()

        # Forward pass
        outputs = model(images, audios)

        # Placeholder target: CrossEntropyLoss expects integer class
        # indices of shape [batch]; modify according to your real labels
        targets = torch.zeros(outputs.size(0), dtype=torch.long, device=device)
        loss = criterion(outputs, targets)

        # Backward pass
        loss.backward()
        optimizer.step()

    print(f"Epoch [{epoch+1}/{num_epochs}], Loss: {loss.item():.4f}")

*10

from torch.utils.data import random_split

# Use 20% of the data for validation, 80% for training
validation_split = 0.2
dataset_size = len(dataset)
validation_size = int(validation_split * dataset_size)
train_size = dataset_size - validation_size

# Split the dataset
train_dataset, validation_dataset = random_split(dataset, [train_size, validation_size])
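random_split draws a fresh random permutation each run; if you want the same split every time (e.g., to compare experiments), you can pass a seeded generator. A small sketch, with an arbitrary seed:

import torch

generator = torch.Generator().manual_seed(42)  # seed value is arbitrary
train_dataset, validation_dataset = random_split(
    dataset, [train_size, validation_size], generator=generator)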

*11

# Create DataLoaders
train_dataloader = DataLoader(train_dataset, batch_size=32, shuffle=True)
validation_dataloader = DataLoader(validation_dataset, batch_size=32, shuffle=False)
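The defaults above work, but on Colab extra worker processes and pinned memory often speed up loading and host-to-GPU transfer. Both are optional tuning knobs, not requirements:

train_dataloader = DataLoader(train_dataset, batch_size=32, shuffle=True,
                              num_workers=2, pin_memory=True)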


*12

def evaluate(model, validation_dataloader, criterion, device):
    model.eval()  # Set model to evaluation mode
    validation_loss = 0.0
    with torch.no_grad():
        for images, audios, filenames in validation_dataloader:
            images, audios = images.to(device), audios.to(device)
            outputs = model(images, audios)
            # get_target is not defined here: implement target fetching
            # logic based on filenames
            targets = get_target(filenames)
            loss = criterion(outputs, targets.to(device))
            validation_loss += loss.item()

    avg_validation_loss = validation_loss / len(validation_dataloader)
    print(f"Validation Loss: {avg_validation_loss:.4f}")
    return avg_validation_loss

*13

torch.save(model.state_dict(), 'best_model.pth')
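To restore the saved weights later, rebuild the same architecture and load the state dict (num_classes must match whatever was used at training time; 10 is the placeholder from above):

model = MultimodalModel(num_classes=10)
model.load_state_dict(torch.load('best_model.pth'))
model.to(device)
model.eval()  # switch to inference mode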

*14

# Variant __getitem__ for a dataset that stores image_paths and labels directly
def __getitem__(self, idx):
    image = Image.open(self.image_paths[idx]).convert("RGB")  # Load image
    label = self.labels[idx]  # Get the corresponding label
    if self.transform:
        image = self.transform(image)  # Apply transformations
    return image, label  # Ensure only two values are returned

*15

data = next(iter(train_dataloader))

print(len(data))   # Number of elements in the returned batch
print(type(data))  # Check the type (the default collate returns a list)


*16

# For this dataset the three elements are images, audios, and filenames
images, labels, additional_info = next(iter(train_dataloader))

*17

print(labels.shape)

*18

import matplotlib.pyplot as plt

# Get a batch of images and labels (here "labels" holds the audio tensors)
images, labels, filenames = next(iter(train_dataloader))

# Get the first audio sample
audio = labels[0].cpu().numpy()  # Convert the tensor to a numpy array

# Plot the audio waveform
plt.figure(figsize=(10, 4))
plt.plot(audio)
plt.title("Audio Waveform of Sample 0")
plt.xlabel("Time (samples)")
plt.ylabel("Amplitude")
plt.show()

*19

# Debugging label extraction
for filename in filenames[:5]:
    print(f"Filename: {filename}, Extracted Label: {get_labels_from_filenames([filename])}")

*20

# Check predicted and actual labels
_, predicted = torch.max(outputs, 1)
print(f"Predicted: {predicted}, Actual: {labels}")

*21

# Visualize or print out image and label pairs
for i in range(5):
    print(f"Image {i}: {images[i].shape}, Label: {labels[i]}")

*22

def get_labels_from_filenames(filenames):
    labels = []
    for filename in filenames:
        # Example: assuming the label is the first part of the filename,
        # e.g. for 'class1_img_1.jpg' extract 'class1'.
        # Customize this for your dataset's naming scheme.
        label = filename.split('_')[0]  # Part before the first underscore
        # Convert the label to an integer class index, assuming names
        # like 'class1', 'class2' (digits after the 'class' prefix)
        label = int(label[5:])
        labels.append(label)

    # Convert labels to a tensor (as long type)
    labels = torch.tensor(labels, dtype=torch.long)
    return labels

*23

correct = 0
total = 0

model.eval()  # Evaluation mode; also skip gradient tracking below
with torch.no_grad():
    for images, audios, filenames in validation_dataloader:
        images = images.to(device)
        audios = audios.to(device)

        # Extract labels from filenames (adjust based on your dataset's structure)
        labels = get_labels_from_filenames(filenames)
        labels = labels.to(device)  # Ensure labels are on the same device

        # Forward pass
        outputs = model(images, audios)
        _, predicted = torch.max(outputs, 1)

        # Tally total and correct predictions
        total += labels.size(0)
        correct += (predicted == labels).sum().item()

# Calculate and print accuracy
accuracy = 100 * correct / total
print(f'Validation Accuracy: {accuracy:.2f}%')
