codefp1

The document outlines a process for using the Kaggle API to download and prepare the Flickr8k audio-caption dataset for a multimodal deep learning project. It includes steps for loading audio and image data, defining a custom dataset class, and creating a neural network model that combines image and audio processing. The document also details the training and evaluation procedures, including data splitting, loss calculation, and saving the model.

*1

from google.colab import files

# Upload kaggle.json (the API token from your Kaggle account settings)
files.upload()

# Move kaggle.json to the directory the Kaggle CLI expects
!mkdir -p ~/.kaggle
!mv kaggle.json ~/.kaggle/
!chmod 600 ~/.kaggle/kaggle.json

# Install the Kaggle API client
!pip install kaggle

*2

# Download the audio-caption dataset
!kaggle datasets download -d warcoder/flickr-8k-audio-caption-corpus

# Unzip dataset
!unzip flickr-8k-audio-caption-corpus.zip -d /content/flickr8k_audio

*3

# Download and unzip the Flickr8k images dataset
!kaggle datasets download -d adityajn105/flickr8k
!unzip flickr8k.zip -d /content/flickr8k_images

*4

import os
import librosa

audio_dir = "/content/flickr8k_audio/flickr_audio/flickr_audio/wavs"

# Example: load one audio file (pass just the filename to os.path.join,
# not the full absolute path; replace with an actual filename)
audio_path = os.path.join(audio_dir, "1000268201_693b08cb0e_0.wav")
audio, sr = librosa.load(audio_path, sr=None)
print(f"Loaded audio with shape: {audio.shape}, Sample Rate: {sr}")

*5

import os
from PIL import Image
import librosa
from IPython.display import display, Audio

# Define paths
audio_dir = "/content/flickr8k_audio/flickr_audio/flickr_audio/wavs"
image_dir = "/content/flickr8k_images/Images"

# Load and display an example image
image_path = os.path.join(image_dir, "1000268201_693b08cb0e.jpg")
image = Image.open(image_path)
display(image)

# Load an example audio file
audio_path = os.path.join(audio_dir, "1000268201_693b08cb0e_0.wav")
audio, sr = librosa.load(audio_path, sr=None)
print(f"Audio Loaded: Shape={audio.shape}, Sampling Rate={sr}")

# Play the audio inline in Colab
display(Audio(audio_path))

*6

import os
import pandas as pd
import librosa
from PIL import Image
from torch.utils.data import Dataset

class Flickr8kAudioImageDataset(Dataset):
    def __init__(self, mapping_file, image_dir, audio_dir, transform=None):
        # mapping_file: CSV with "image", "audio", and "caption" columns
        self.data = pd.read_csv(mapping_file)
        self.image_dir = image_dir
        self.audio_dir = audio_dir
        self.transform = transform

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        row = self.data.iloc[idx]
        image_path = os.path.join(self.image_dir, row["image"])
        audio_path = os.path.join(self.audio_dir, row["audio"])
        caption = row["caption"]

        # Load image
        image = Image.open(image_path)
        if self.transform:
            image = self.transform(image)

        # Load audio
        audio, sr = librosa.load(audio_path, sr=None)

        return image, audio, caption
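The class above assumes mapping_file is a CSV with image, audio, and caption columns. No such file ships with the downloads above, so you would build it yourself. A hypothetical sketch that pairs each image with its first recording (the corpus names its wav files <image_base>_<n>.wav, and the caption column is left empty here):

import os
import pandas as pd

rows = []
for image_name in sorted(os.listdir(image_dir)):
    base = os.path.splitext(image_name)[0]
    wav_name = f"{base}_0.wav"  # first of up to five recordings per image
    if os.path.exists(os.path.join(audio_dir, wav_name)):
        rows.append({"image": image_name, "audio": wav_name, "caption": ""})

pd.DataFrame(rows).to_csv("mapping.csv", index=False)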

*7

import os

image_dir = "/content/flickr8k_images/Images"
audio_dir = "/content/flickr8k_audio/flickr_audio/flickr_audio/wavs"

image_filenames = os.listdir(image_dir)
audio_filenames = os.listdir(audio_dir)

print(f"Number of images: {len(image_filenames)}")
print(f"Number of audio files: {len(audio_filenames)}")

# Optional: print the first few files to see which ones exist
print(f"First few image filenames: {image_filenames[:5]}")
print(f"First few audio filenames: {audio_filenames[:5]}")

*8

import os
import torch
from torch.utils.data import Dataset, DataLoader
from PIL import Image
import librosa
from torchvision import transforms
import numpy as np
import matplotlib.pyplot as plt

class Flickr8kAudioImageDataset(Dataset):
    def __init__(self, image_dir, audio_dir, transform=None, audio_length=22050):
        self.image_dir = image_dir
        self.audio_dir = audio_dir
        self.transform = transform
        # Target length for audio (e.g., 22050 samples = 1 second at 22.05 kHz)
        self.audio_length = audio_length

        # Get image and audio filenames
        self.image_filenames = os.listdir(image_dir)
        self.audio_filenames = os.listdir(audio_dir)

        # Sort filenames by their base name (extensions stripped)
        image_base_filenames = sorted([os.path.splitext(f)[0] for f in self.image_filenames])
        audio_base_filenames = sorted([os.path.splitext(f)[0] for f in self.audio_filenames])

        # Map each image to its first matching audio file; audio base names
        # look like "<image_base>_<n>", so startswith pairs them
        image_to_audio_map = {}
        for image_base in image_base_filenames:
            corresponding_audio = [audio for audio in audio_base_filenames
                                   if audio.startswith(image_base)]
            if corresponding_audio:
                image_to_audio_map[image_base] = corresponding_audio[0]

        # Create lists of matching image and audio filenames
        self.image_filenames = [base + ".jpg" for base in image_to_audio_map.keys()]
        self.audio_filenames = [audio + ".wav" for audio in image_to_audio_map.values()]

        # Ensure there is at least one matching pair
        assert len(self.image_filenames) > 0, "No matching image and audio files found"

    def __len__(self):
        return len(self.image_filenames)

    def __getitem__(self, idx):
        # Get the image and audio file names
        image_filename = self.image_filenames[idx]
        audio_filename = self.audio_filenames[idx]

        # Load the image (convert to RGB so the tensor always has 3 channels)
        image_path = os.path.join(self.image_dir, image_filename)
        image = Image.open(image_path).convert("RGB")

        # Apply the given transform, or a default resize + to-tensor
        if self.transform:
            image = self.transform(image)
        else:
            default_transform = transforms.Compose([
                transforms.Resize((256, 256)),
                transforms.ToTensor(),
            ])
            image = default_transform(image)

        # Load the audio
        audio_path = os.path.join(self.audio_dir, audio_filename)
        audio, sr = librosa.load(audio_path, sr=None)

        # Pad with zeros or truncate so the audio matches the target length
        if len(audio) < self.audio_length:
            audio = np.pad(audio, (0, self.audio_length - len(audio)), mode='constant')
        else:
            audio = audio[:self.audio_length]

        # Convert audio to tensor
        audio = torch.tensor(audio, dtype=torch.float32)

        # Return image, audio, and the filename (for captions or other lookups)
        return image, audio, audio_filename

# Define paths
audio_dir = "/content/flickr8k_audio/flickr_audio/flickr_audio/wavs"
image_dir = "/content/flickr8k_images/Images"

# Initialize dataset and DataLoader
dataset = Flickr8kAudioImageDataset(
    image_dir=image_dir,
    audio_dir=audio_dir,
    transform=None,       # Add image transformations if needed
    audio_length=22050,   # 1 second at 22.05 kHz
)
dataloader = DataLoader(dataset, batch_size=32, shuffle=True)

# Iterate over the DataLoader
for batch in dataloader:
    images, audios, filenames = batch
    print("Images shape:", images[0].size())  # Per-image shape, [3, 256, 256]
    print("Audios shape:", audios.shape)      # [batch_size, 22050]
    print("Filenames:", filenames[:5])        # Show first few filenames

    # Visualize the first image and its corresponding audio waveform
    plt.figure(figsize=(12, 6))

    # Plot image
    plt.subplot(1, 2, 1)
    plt.imshow(images[0].permute(1, 2, 0))  # [C, H, W] -> [H, W, C] for plotting
    plt.title(f"Image: {filenames[0]}")

    # Plot audio waveform
    plt.subplot(1, 2, 2)
    plt.plot(audios[0].numpy())  # Convert tensor to numpy for plotting
    plt.title(f"Audio waveform: {filenames[0]}")
    plt.show()

    break  # Only display the first batch; remove to loop over all batches
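The dataset is created with transform=None, so it falls back to the default resize + to-tensor above. For a custom pipeline, here is a sketch of a typical one; the Normalize statistics are the standard ImageNet values, an assumption rather than anything tuned for Flickr8k:

from torchvision import transforms

image_transform = transforms.Compose([
    transforms.Resize((256, 256)),
    transforms.ToTensor(),
    # ImageNet mean/std -- assumed defaults, not Flickr8k-specific
    transforms.Normalize(mean=[0.485, 0.456, 0.406],
                         std=[0.229, 0.224, 0.225]),
])

dataset = Flickr8kAudioImageDataset(
    image_dir=image_dir,
    audio_dir=audio_dir,
    transform=image_transform,
    audio_length=22050,
)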

*9

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

# Define a simple CNN for image processing
class ImageModel(nn.Module):
    def __init__(self):
        super(ImageModel, self).__init__()
        self.conv1 = nn.Conv2d(3, 32, kernel_size=3, stride=1, padding=1)
        self.pool = nn.MaxPool2d(2, 2)
        # Use adaptive pooling to handle varying input sizes
        self.adaptive_pool = nn.AdaptiveAvgPool2d((8, 8))
        self.fc1 = nn.Linear(32 * 8 * 8, 512)
        self.fc2 = nn.Linear(512, 128)

    def forward(self, x):
        x = self.pool(F.relu(self.conv1(x)))
        x = self.adaptive_pool(x)       # Adaptive pooling handles different input sizes
        x = x.view(-1, 32 * 8 * 8)      # Flatten the output
        x = F.relu(self.fc1(x))
        x = self.fc2(x)
        return x

# Define an RNN for audio processing
class AudioModel(nn.Module):
    def __init__(self, input_size=22050, hidden_size=128):
        super(AudioModel, self).__init__()
        self.rnn = nn.LSTM(input_size, hidden_size, batch_first=True)
        self.fc = nn.Linear(hidden_size, 128)

    def forward(self, x):
        # The LSTM expects (batch_size, seq_len, input_size); the raw
        # waveforms arrive as (batch_size, input_size), so add a
        # seq_len=1 dimension before the recurrent layer
        if x.dim() == 2:
            x = x.unsqueeze(1)
        out, _ = self.rnn(x)    # (batch_size, seq_len, hidden_size)
        out = out[:, -1, :]     # Take the last timestep
        out = self.fc(out)      # Pass through the fully connected layer
        return out

# Combine both branches in the MultimodalModel class
class MultimodalModel(nn.Module):
    def __init__(self, num_classes):
        super(MultimodalModel, self).__init__()
        self.image_model = ImageModel()
        self.audio_model = AudioModel()
        self.fc = nn.Linear(128 + 128, num_classes)  # For multiclass classification

    def forward(self, image, audio):
        image_features = self.image_model(image)
        audio_features = self.audio_model(audio)
        # Concatenate image and audio features
        combined = torch.cat((image_features, audio_features), dim=1)
        output = self.fc(combined)
        return output

# Initialize the model (num_classes=10 is a placeholder; set it to the
# size of your label set)
model = MultimodalModel(num_classes=10)
criterion = nn.CrossEntropyLoss()  # For multiclass classification
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Move the model to GPU if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Training loop (simplified)
num_epochs = 10
for epoch in range(num_epochs):
    model.train()
    for images, audios, filenames in dataloader:
        images, audios = images.to(device), audios.to(device)  # Send data to GPU if available

        optimizer.zero_grad()

        # Forward pass
        outputs = model(images, audios)

        # Placeholder target: CrossEntropyLoss expects integer class
        # indices of shape [batch]; modify according to your real labels
        targets = torch.zeros(outputs.size(0), dtype=torch.long, device=device)
        loss = criterion(outputs, targets)

        # Backward pass
        loss.backward()
        optimizer.step()

    print(f"Epoch [{epoch+1}/{num_epochs}], Loss: {loss.item():.4f}")

*10

from torch.utils.data import random_split

# Use 20% of the data for validation, 80% for training
validation_split = 0.2
dataset_size = len(dataset)
validation_size = int(validation_split * dataset_size)
train_size = dataset_size - validation_size

# Split the dataset
train_dataset, validation_dataset = random_split(dataset, [train_size, validation_size])
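random_split draws a fresh random permutation each run; if you want the same split every time (e.g., to compare experiments), you can pass a seeded generator. A small sketch, with an arbitrary seed:

import torch

generator = torch.Generator().manual_seed(42)  # seed value is arbitrary
train_dataset, validation_dataset = random_split(
    dataset, [train_size, validation_size], generator=generator)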

*11

# Create DataLoaders
train_dataloader = DataLoader(train_dataset, batch_size=32, shuffle=True)
validation_dataloader = DataLoader(validation_dataset, batch_size=32, shuffle=False)
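The defaults above work, but on Colab extra worker processes and pinned memory often speed up loading and host-to-GPU transfer. Both are optional tuning knobs, not requirements:

train_dataloader = DataLoader(train_dataset, batch_size=32, shuffle=True,
                              num_workers=2, pin_memory=True)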


*12

def evaluate(model, validation_dataloader, criterion, device):
    model.eval()  # Set model to evaluation mode
    validation_loss = 0.0
    with torch.no_grad():
        for images, audios, filenames in validation_dataloader:
            images, audios = images.to(device), audios.to(device)
            outputs = model(images, audios)
            # get_target is not defined here: implement target fetching
            # logic based on filenames
            targets = get_target(filenames)
            loss = criterion(outputs, targets.to(device))
            validation_loss += loss.item()

    avg_validation_loss = validation_loss / len(validation_dataloader)
    print(f"Validation Loss: {avg_validation_loss:.4f}")
    return avg_validation_loss

*13

torch.save(model.state_dict(), 'best_model.pth')
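To restore the saved weights later, rebuild the same architecture and load the state dict (num_classes must match whatever was used at training time; 10 is the placeholder from above):

model = MultimodalModel(num_classes=10)
model.load_state_dict(torch.load('best_model.pth'))
model.to(device)
model.eval()  # switch to inference mode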

*14

# Variant __getitem__ for a dataset that stores image_paths and labels directly
def __getitem__(self, idx):
    image = Image.open(self.image_paths[idx]).convert("RGB")  # Load image
    label = self.labels[idx]  # Get the corresponding label
    if self.transform:
        image = self.transform(image)  # Apply transformations
    return image, label  # Ensure only two values are returned

*15

data = next(iter(train_dataloader))

print(len(data))   # Number of elements in the returned batch
print(type(data))  # Check the type (the default collate returns a list)


*16

# For this dataset the three elements are images, audios, and filenames
images, labels, additional_info = next(iter(train_dataloader))

*17

print(labels.shape)

*18

import matplotlib.pyplot as plt

# Get a batch of images and labels (here "labels" holds the audio tensors)
images, labels, filenames = next(iter(train_dataloader))

# Get the first audio sample
audio = labels[0].cpu().numpy()  # Convert the tensor to a numpy array

# Plot the audio waveform
plt.figure(figsize=(10, 4))
plt.plot(audio)
plt.title("Audio Waveform of Sample 0")
plt.xlabel("Time (samples)")
plt.ylabel("Amplitude")
plt.show()

*19

# Debugging label extraction
for filename in filenames[:5]:
    print(f"Filename: {filename}, Extracted Label: {get_labels_from_filenames([filename])}")

*20

# Check predicted and actual labels
_, predicted = torch.max(outputs, 1)
print(f"Predicted: {predicted}, Actual: {labels}")

*21

# Visualize or print out image and label pairs
for i in range(5):
    print(f"Image {i}: {images[i].shape}, Label: {labels[i]}")

*22

def get_labels_from_filenames(filenames):
    labels = []
    for filename in filenames:
        # Example: assuming the label is the first part of the filename,
        # e.g. for 'class1_img_1.jpg' extract 'class1'.
        # Customize this for your dataset's naming scheme.
        label = filename.split('_')[0]  # Part before the first underscore
        # Convert the label to an integer class index, assuming names
        # like 'class1', 'class2' (digits after the 'class' prefix)
        label = int(label[5:])
        labels.append(label)

    # Convert labels to a tensor (as long type)
    labels = torch.tensor(labels, dtype=torch.long)
    return labels

*23

correct = 0
total = 0

model.eval()  # Evaluation mode; also skip gradient tracking below
with torch.no_grad():
    for images, audios, filenames in validation_dataloader:
        images = images.to(device)
        audios = audios.to(device)

        # Extract labels from filenames (adjust based on your dataset's structure)
        labels = get_labels_from_filenames(filenames)
        labels = labels.to(device)  # Ensure labels are on the same device

        # Forward pass
        outputs = model(images, audios)
        _, predicted = torch.max(outputs, 1)

        # Tally total and correct predictions
        total += labels.size(0)
        correct += (predicted == labels).sum().item()

# Calculate and print accuracy
accuracy = 100 * correct / total
print(f'Validation Accuracy: {accuracy:.2f}%')
