# Upload kaggle.json (Kaggle API credentials) and move it into place
from google.colab import files
files.upload()
!mkdir -p ~/.kaggle
!cp kaggle.json ~/.kaggle/
!chmod 600 ~/.kaggle/kaggle.json
# Download dataset
# Unzip dataset
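The exact download and unzip commands for these two steps depend on which Kaggle mirrors of Flickr8k are used. A minimal sketch, assuming the images come from a Kaggle dataset such as adityajn105/flickr8k and the spoken captions from a separate audio dataset (both identifiers are placeholders to replace with the ones you actually use), unzipped so the folder layout matches the paths referenced below:

# Download dataset (dataset slugs are placeholders; substitute the ones you use)
!kaggle datasets download -d adityajn105/flickr8k -p /content
!kaggle datasets download -d <owner>/<flickr8k-audio-dataset> -p /content

# Unzip dataset into the folders expected by the rest of the notebook
!unzip -q /content/flickr8k.zip -d /content/flickr8k_images
!unzip -q /content/<flickr8k-audio-dataset>.zip -d /content/flickr8k_audio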
import os
import librosa

audio_dir = "/content/flickr8k_audio/flickr_audio/flickr_audio/wavs"

# Example: Load an audio file (replace with an actual filename from audio_dir)
audio_path = os.path.join(audio_dir, "1000268201_693b08cb0e_0.wav")
audio, sr = librosa.load(audio_path, sr=None)
print("Loaded", audio.shape[0], "samples at", sr, "Hz")
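To sanity-check the loaded waveform by ear in Colab, IPython's audio widget can be used. A minimal sketch using the audio array and sample rate loaded above:

from IPython.display import Audio, display

# Play the clip inline in the notebook
display(Audio(audio, rate=sr))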
import os
from PIL import Image
from IPython.display import display

# Define paths
audio_dir = "/content/flickr8k_audio/flickr_audio/flickr_audio/wavs"
image_dir = "/content/flickr8k_images/Images"

# Example: open and display one image (replace with an actual filename from image_dir)
image_path = os.path.join(image_dir, "1000268201_693b08cb0e.jpg")
image = Image.open(image_path)
display(image)
import os
import pandas as pd
import librosa
from PIL import Image
from torch.utils.data import Dataset

class Flickr8kAudioImageDataset(Dataset):
    def __init__(self, mapping_file, image_dir, audio_dir, transform=None):
        # mapping_file: CSV pairing each image with an audio file and a caption
        # (column names "image", "audio", "caption" are assumed here; adjust to your file)
        self.data = pd.read_csv(mapping_file)
        self.image_dir = image_dir
        self.audio_dir = audio_dir
        self.transform = transform

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        row = self.data.iloc[idx]
        caption = row["caption"]

        # Load image
        image_path = os.path.join(self.image_dir, row["image"])
        image = Image.open(image_path).convert("RGB")
        if self.transform:
            image = self.transform(image)

        # Load audio
        audio_path = os.path.join(self.audio_dir, row["audio"])
        audio, sr = librosa.load(audio_path, sr=None)

        return image, audio, caption
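The class above expects a mapping file that is not part of the raw downloads. A minimal sketch of how such a CSV could be assembled, assuming the image archive ships a captions.txt with "image" and "caption" columns and that each image's first spoken caption is the wav named "<image base>_0.wav"; both assumptions, and the captions_path and mapping.csv locations, should be checked against your copy of the data:

import os
import pandas as pd

captions_path = "/content/flickr8k_images/captions.txt"  # assumed location and format of the caption file
audio_dir = "/content/flickr8k_audio/flickr_audio/flickr_audio/wavs"

captions = pd.read_csv(captions_path)                     # expected columns: "image", "caption"
captions = captions.drop_duplicates(subset="image")       # keep one caption per image

# Pair each image with its first spoken caption, e.g. 123_abc.jpg -> 123_abc_0.wav
captions["audio"] = captions["image"].str.replace(".jpg", "_0.wav", regex=False)

# Keep only rows whose audio file actually exists, then write the mapping file
existing = set(os.listdir(audio_dir))
mapping = captions[captions["audio"].isin(existing)]
mapping.to_csv("/content/mapping.csv", index=False)
print(len(mapping), "image/audio/caption triples written")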
import os

image_dir = "/content/flickr8k_images/Images"
audio_dir = "/content/flickr8k_audio/flickr_audio/flickr_audio/wavs"

image_filenames = os.listdir(image_dir)
audio_filenames = os.listdir(audio_dir)

# Optional: Print the first few files to see which ones exist
print(image_filenames[:5])
print(audio_filenames[:5])
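Before building the dataset below, it helps to confirm that the two directories actually pair up by base name (image "123_abc.jpg" vs. audio "123_abc_0.wav"). A small sanity check along those lines:

# Base names: "123_abc.jpg" -> "123_abc", "123_abc_0.wav" -> "123_abc"
image_bases = {os.path.splitext(f)[0] for f in image_filenames}
audio_bases = {os.path.splitext(f)[0].rsplit("_", 1)[0] for f in audio_filenames}

print("Images with at least one audio caption:", len(image_bases & audio_bases))
print("Images without audio:", len(image_bases - audio_bases))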
import os
import torch
import librosa
import numpy as np
from PIL import Image
from torch.utils.data import Dataset

class Flickr8kAudioImageDataset(Dataset):
    def __init__(self, image_dir, audio_dir, transform=None, audio_length=22050):
        self.image_dir = image_dir
        self.audio_dir = audio_dir
        self.transform = transform
        self.audio_length = audio_length  # Target length for audio (e.g., 22050 samples for 1 second at 22.05 kHz)

        # Get image and audio filenames
        self.image_filenames = os.listdir(image_dir)
        self.audio_filenames = os.listdir(audio_dir)

        # Sort the audio filenames and key them by their base name (remove suffix and strip extensions)
        image_to_audio_map = {}
        for audio_filename in sorted(self.audio_filenames):
            base = os.path.splitext(audio_filename)[0].rsplit("_", 1)[0]
            image_to_audio_map.setdefault(base, audio_filename)

        # Map image filenames to corresponding audio files (first occurrence for each)
        matched_images, matched_audios = [], []
        for image_filename in self.image_filenames:
            corresponding_audio = image_to_audio_map.get(os.path.splitext(image_filename)[0])
            if corresponding_audio:
                matched_images.append(image_filename)
                matched_audios.append(corresponding_audio)
        self.image_filenames = matched_images
        self.audio_filenames = matched_audios
        assert len(self.image_filenames) > 0, "No matching image and audio files found"

    def __len__(self):
        return len(self.image_filenames)

    def __getitem__(self, idx):
        image_filename = self.image_filenames[idx]
        audio_filename = self.audio_filenames[idx]

        # Load image
        image_path = os.path.join(self.image_dir, image_filename)
        image = Image.open(image_path).convert("RGB")
        if self.transform:
            image = self.transform(image)

        # Load audio
        audio_path = os.path.join(self.audio_dir, audio_filename)
        audio, _ = librosa.load(audio_path, sr=None)

        # Ensure the audio length matches the target length by padding or truncating
        if len(audio) < self.audio_length:
            # Pad with zeros if the audio is shorter than the target length
            audio = np.pad(audio, (0, self.audio_length - len(audio)))
        else:
            audio = audio[:self.audio_length]

        return image, torch.tensor(audio, dtype=torch.float32), image_filename
import matplotlib.pyplot as plt
from torch.utils.data import DataLoader
from torchvision import transforms

# Define paths
audio_dir = "/content/flickr8k_audio/flickr_audio/flickr_audio/wavs"
image_dir = "/content/flickr8k_images/Images"

transform = transforms.Compose([transforms.Resize((256, 256)), transforms.ToTensor()])

dataset = Flickr8kAudioImageDataset(
    image_dir=image_dir,
    audio_dir=audio_dir,
    transform=transform,
)
dataloader = DataLoader(dataset, batch_size=32, shuffle=True)

for images, audios, filenames in dataloader:
    print("Images shape:", images[0].size())  # For image shape (should be [3, 256, 256])

    plt.figure(figsize=(12, 6))

    # Plot image
    plt.subplot(1, 2, 1)
    plt.imshow(images[0].permute(1, 2, 0))
    plt.title(f"Image: {filenames[0]}")

    # Plot audio waveform
    plt.subplot(1, 2, 2)
    plt.plot(audios[0].numpy())

    plt.show()
    break  # Only display the first batch, remove break to loop over all batches
import torch
import torch.nn as nn

class ImageModel(nn.Module):
    def __init__(self):
        super(ImageModel, self).__init__()
        # Layer sizes below assume 256x256 RGB inputs; adjust them to your data
        self.conv1 = nn.Conv2d(3, 16, kernel_size=3, padding=1)
        self.pool = nn.MaxPool2d(2, 2)
        self.fc1 = nn.Linear(16 * 128 * 128, 256)
        self.fc2 = nn.Linear(256, 128)

    def forward(self, x):
        x = self.pool(nn.ReLU()(self.conv1(x)))
        x = x.view(x.size(0), -1)
        x = nn.ReLU()(self.fc1(x))
        x = self.fc2(x)
        return x

class AudioModel(nn.Module):
    def __init__(self):
        super(AudioModel, self).__init__()
        # Treat the fixed-length waveform as a one-step sequence; hidden size is a placeholder
        self.lstm = nn.LSTM(input_size=22050, hidden_size=128, batch_first=True)

    def forward(self, x):
        # If the LSTM input is 2D (which might happen if you process sequences with fixed length),
        # add a sequence dimension so the output is 3D
        if x.dim() == 2:
            x = x.unsqueeze(1)
        out, _ = self.lstm(x)
        out = out[:, -1, :]  # Take the last timestep (works now as the tensor is 3D)
        return out

class MultimodalModel(nn.Module):
    def __init__(self, num_classes=10):  # num_classes is a placeholder; set it for your task
        super(MultimodalModel, self).__init__()
        self.image_model = ImageModel()
        self.audio_model = AudioModel()
        self.fc = nn.Linear(128 + 128, num_classes)

    def forward(self, image, audio):
        image_features = self.image_model(image)
        audio_features = self.audio_model(audio)
        combined = torch.cat((image_features, audio_features), dim=1)
        output = self.fc(combined)
        return output
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = MultimodalModel()
model.to(device)

criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)

num_epochs = 10
for epoch in range(num_epochs):
    model.train()
    for images, audios, filenames in dataloader:
        images, audios = images.to(device), audios.to(device)
        # Labels are derived from filenames (see get_labels_from_filenames defined later)
        labels = torch.tensor(get_labels_from_filenames(filenames)).to(device)
        # Forward pass
        outputs = model(images, audios)
        loss = criterion(outputs, labels)
        # Backward pass
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
from torch.utils.data import random_split

# Define the percentage of data to be used for validation (e.g., 20% validation, 80% training)
validation_split = 0.2
dataset_size = len(dataset)
validation_size = int(validation_split * dataset_size)
train_size = dataset_size - validation_size
# Create DataLoaders for the train/validation split
train_dataset, validation_dataset = random_split(dataset, [train_size, validation_size])
train_dataloader = DataLoader(train_dataset, batch_size=32, shuffle=True)
validation_dataloader = DataLoader(validation_dataset, batch_size=32, shuffle=False)

# Compute the loss on the validation set
model.eval()
validation_loss = 0.0
with torch.no_grad():
    for images, audios, filenames in validation_dataloader:
        labels = torch.tensor(get_labels_from_filenames(filenames)).to(device)
        outputs = model(images.to(device), audios.to(device))
        loss = criterion(outputs, labels)
        validation_loss += loss.item()
print("Validation loss:", validation_loss / len(validation_dataloader))
torch.save(model.state_dict(), 'best_model.pth')
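The 'best_model.pth' checkpoint above is only meaningful if some notion of "best" is tracked. A minimal sketch that saves the weights whenever the validation loss improves; save_if_best and best_validation_loss are helpers introduced here for illustration, not part of the original code:

import torch

def save_if_best(model, validation_loss, best_validation_loss, path="best_model.pth"):
    # Save the model weights whenever the validation loss improves; return the new best loss
    if validation_loss < best_validation_loss:
        torch.save(model.state_dict(), path)
        print(f"Saved new best model (validation loss {validation_loss:.4f})")
        return validation_loss
    return best_validation_loss

# Usage inside the epoch loop (best_validation_loss starts at infinity):
# best_validation_loss = float("inf")
# best_validation_loss = save_if_best(model, validation_loss, best_validation_loss)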
# Quick sanity check: grab one batch from the training DataLoader
data = next(iter(train_dataloader))
images, audios, filenames = data
labels = torch.tensor(get_labels_from_filenames(filenames))
print(labels.shape)
# Plot the waveform of the audio clip loaded earlier with librosa
plt.figure(figsize=(10, 4))
plt.plot(audio)
plt.xlabel("Time (samples)")
plt.ylabel("Amplitude")
plt.title("Audio waveform")
plt.show()
# Example: inspect the first few dataset samples (filenames and tensor shapes)
for i in range(5):
    image, audio_clip, filename = dataset[i]
    print(filename, tuple(image.shape), tuple(audio_clip.shape))
def get_labels_from_filenames(filenames):
    labels = []
    for filename in filenames:
        # Extract the label token from the filename (adjust to your naming scheme;
        # here filenames are assumed to look like "class1_xxx.jpg")
        label = os.path.splitext(filename)[0].split("_")[0]
        # Convert label to an integer (if needed, here assuming class labels are numeric)
        label = int(label[5:])  # Assuming labels are numeric after 'class' (e.g., 'class1', 'class2')
        labels.append(label)
    return labels
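A quick check of the assumed naming scheme, using made-up filenames; real Flickr8k filenames do not carry class prefixes, so this helper only applies if your files are named or relabelled accordingly:

# Hypothetical filenames that follow the "class<N>_..." convention assumed above
example_filenames = ["class1_photo_a.jpg", "class2_photo_b.jpg"]
print(get_labels_from_filenames(example_filenames))  # -> [1, 2]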
correct = 0
total = 0
for images, audios, filenames in validation_dataloader:  # Assuming you have filenames as part of the batch
    images = images.to(device)
    audios = audios.to(device)
    # Extract labels from filenames (adjust this part based on how your dataset is structured)
    labels = torch.tensor(get_labels_from_filenames(filenames)).to(device)
    # Forward pass
    outputs = model(images, audios)
    _, predicted = torch.max(outputs, 1)
    total += labels.size(0)