
Code File

The document outlines a Vision Transformer-based system for copy-move forgery detection (CMFD) using synthetic dataset generation and a multi-modal attention model. It includes classes for generating synthetic images, a dataset class, model architecture, training utilities, and a Gradio interface for predictions. The system is designed to train on generated data and evaluate performance using metrics like F1 score and AUC.


"""

Vision Transformer-Based CMFD System


Author: AI Assistant (2023)
License: MIT
"""

import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
import cv2
import albumentations as A
from torch.utils.data import Dataset, DataLoader
from sklearn.metrics import f1_score, roc_auc_score
import timm
import gradio as gr

# --- Synthetic Dataset Generation ---


class SyntheticCMFDGenerator:
    def __init__(self, img_size=224):
        self.img_size = img_size
        # Mock backgrounds; swap in real images for serious training
        self.backgrounds = [np.random.rand(img_size, img_size, 3).astype(np.float32)
                            for _ in range(100)]

    def create_forgery(self):
        # 1. Random background (copied so the cached array is not mutated)
        bg = self.backgrounds[np.random.randint(0, 100)].copy()

        # 2. Copy-move operation: lift a 100x100 source patch
        obj = bg[50:150, 50:150].copy()

        # 3. Apply transformations; resize back so the patch fits its paste slot
        transform = A.Compose([
            A.Rotate(limit=45, p=0.7),
            A.RandomScale(scale_limit=0.2, p=0.5),
            A.GaussianBlur(p=0.3),
            A.Resize(100, 100)
        ])
        transformed = transform(image=obj)['image']

        # 4. Paste at a random location and record the ground-truth mask
        mask = np.zeros((self.img_size, self.img_size), dtype=np.float32)
        x, y = np.random.randint(0, 100), np.random.randint(0, 100)
        bg[y:y+100, x:x+100] = transformed
        mask[y:y+100, x:x+100] = 1.0

        return (bg * 255).astype(np.uint8), mask
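
# A minimal sanity check for the generator (illustrative sketch, not part of
# the training pipeline); the expected shapes follow from the 224 default above.
def _check_generator():
    gen = SyntheticCMFDGenerator()
    img, mask = gen.create_forgery()
    assert img.shape == (224, 224, 3) and img.dtype == np.uint8
    assert mask.shape == (224, 224) and mask.max() == 1.0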

# --- Dataset Class ---


class CMFDDataset(Dataset):
    def __init__(self, generator, num_samples=1000, transform=None):
        self.generator = generator
        self.num_samples = num_samples
        self.transform = transform

    def __len__(self):
        return self.num_samples

    def __getitem__(self, idx):
        # Generate a synthetic sample on the fly
        img, mask = self.generator.create_forgery()

        # Augmentations (applied jointly to image and mask)
        if self.transform:
            augmented = self.transform(image=img, mask=mask)
            img, mask = augmented['image'], augmented['mask']

        # Convert to tensors: RGB, LAB, and Canny-edge views of the same image
        img_rgb = torch.FloatTensor(img).permute(2, 0, 1) / 255.0
        img_lab = torch.FloatTensor(
            cv2.cvtColor(img, cv2.COLOR_RGB2LAB)).permute(2, 0, 1) / 255.0
        edges = torch.FloatTensor(cv2.Canny(img, 100, 200))[None, :, :] / 255.0
        mask = torch.FloatTensor(mask)[None, :, :]

        return {'rgb': img_rgb, 'lab': img_lab, 'edges': edges, 'mask': mask}
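
# Quick look at one batch (a sketch; batch size 4 and 8 samples are arbitrary):
def _check_dataset():
    loader = DataLoader(CMFDDataset(SyntheticCMFDGenerator(), num_samples=8),
                        batch_size=4)
    batch = next(iter(loader))
    # Expected: rgb/lab (4, 3, 224, 224); edges/mask (4, 1, 224, 224)
    print({k: tuple(v.shape) for k, v in batch.items()})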

# --- Model Architecture ---


class MultiModalAttentionCMFD(nn.Module):
    def __init__(self):
        super().__init__()

        # Vision Transformer branches (one per colour space)
        self.vit_rgb = timm.create_model('vit_base_patch16_224',
                                         pretrained=True, num_classes=0)
        self.vit_lab = timm.create_model('vit_base_patch16_224',
                                         pretrained=True, num_classes=0)

        # Edge feature extractor: 224x224 -> 14x14 so its tokens align with the
        # ViT's 14x14 patch grid, projected to the 768-d ViT embedding width
        self.edge_net = nn.Sequential(
            nn.Conv2d(1, 64, 3, padding=1), nn.ReLU(), nn.MaxPool2d(2),    # 112
            nn.Conv2d(64, 128, 3, padding=1), nn.ReLU(), nn.MaxPool2d(2),  # 56
            nn.Conv2d(128, 256, 3, padding=1), nn.ReLU(), nn.MaxPool2d(2), # 28
            nn.Conv2d(256, 768, 3, padding=1), nn.ReLU(), nn.MaxPool2d(2)  # 14
        )

        # Cross-modal attention over the 196 patch tokens
        self.cross_attn = nn.MultiheadAttention(embed_dim=768, num_heads=8,
                                                batch_first=True)

        # Decoder: 14x14 -> 224x224 via four 2x upsampling stages
        self.decoder = nn.Sequential(
            nn.ConvTranspose2d(768, 256, 4, stride=2, padding=1),  # 28
            nn.ReLU(),
            nn.ConvTranspose2d(256, 128, 4, stride=2, padding=1),  # 56
            nn.ReLU(),
            nn.ConvTranspose2d(128, 64, 4, stride=2, padding=1),   # 112
            nn.ReLU(),
            nn.ConvTranspose2d(64, 32, 4, stride=2, padding=1),    # 224
            nn.ReLU(),
            nn.Conv2d(32, 1, 1),
            nn.Sigmoid()
        )

    def forward(self, rgb, lab, edges):
        # Token-level ViT features via timm's forward_features (B, 197, 768);
        # drop the class token so 196 tokens map onto the 14x14 patch grid
        rgb_feats = self.vit_rgb.forward_features(rgb)[:, 1:, :]
        lab_feats = self.vit_lab.forward_features(lab)[:, 1:, :]
        edge_feats = self.edge_net(edges)  # (B, 768, 14, 14)

        # Reshape edge features for attention: (B, N, C)
        edge_feats = edge_feats.flatten(2).permute(0, 2, 1)  # (B, 196, 768)

        # Cross-modal attention: RGB tokens query LAB keys and edge values
        attn_out, _ = self.cross_attn(
            query=rgb_feats,
            key=lab_feats,
            value=edge_feats
        )

        # Fold the token sequence back into a 14x14 map and decode to a mask
        B, N, C = attn_out.shape
        h = w = int(N ** 0.5)
        attn_out = attn_out.permute(0, 2, 1).reshape(B, C, h, w)
        return self.decoder(attn_out)
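
# Shape sanity check for the model (sketch; random inputs, no training).
# Note: instantiating the model fetches pretrained ViT weights on first run.
def _check_model():
    model = MultiModalAttentionCMFD().eval()
    with torch.no_grad():
        out = model(torch.randn(1, 3, 224, 224),
                    torch.randn(1, 3, 224, 224),
                    torch.randn(1, 1, 224, 224))
    assert out.shape == (1, 1, 224, 224)
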
# --- Training Utilities ---


class CMFDTrainer:
    def __init__(self):
        self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

        # Data generation and augmentation
        self.generator = SyntheticCMFDGenerator()
        self.transform = A.Compose([
            A.HorizontalFlip(p=0.5),
            A.VerticalFlip(p=0.5),
            A.RandomBrightnessContrast(p=0.3)
        ])

        # Datasets & loaders
        self.train_set = CMFDDataset(self.generator, 1000, self.transform)
        self.val_set = CMFDDataset(self.generator, 200)
        self.train_loader = DataLoader(self.train_set, batch_size=8, shuffle=True)
        self.val_loader = DataLoader(self.val_set, batch_size=8)

        # Model & optimizer
        self.model = MultiModalAttentionCMFD().to(self.device)
        self.optimizer = optim.AdamW(self.model.parameters(), lr=3e-5)
        self.criterion = nn.BCELoss()

    def dice_loss(self, pred, target):
        # Soft Dice: 1 - 2|P∩T| / (|P| + |T|), smoothed to avoid division by zero
        smooth = 1e-5
        intersection = (pred * target).sum()
        return 1 - (2. * intersection + smooth) / (pred.sum() + target.sum() + smooth)
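
    # Tiny worked example of the Dice term (illustrative):
    #   pred = [1, 1, 0, 0], target = [1, 0, 0, 0]
    #   intersection = 1, pred.sum() = 2, target.sum() = 1
    #   loss ≈ 1 - 2*1 / (2 + 1) = 1/3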

    def train_epoch(self):
        self.model.train()
        total_loss = 0.0

        for batch in self.train_loader:
            batch = {k: v.to(self.device) for k, v in batch.items()}
            self.optimizer.zero_grad()

            outputs = self.model(
                batch['rgb'],
                batch['lab'],
                batch['edges']
            )

            # Weighted combination of soft Dice and pixel-wise BCE
            loss = (0.7 * self.dice_loss(outputs, batch['mask'])
                    + 0.3 * self.criterion(outputs, batch['mask']))
            loss.backward()
            self.optimizer.step()

            total_loss += loss.item()

        return total_loss / len(self.train_loader)

    def validate(self):
        self.model.eval()
        preds, targets = [], []

        with torch.no_grad():
            for batch in self.val_loader:
                batch = {k: v.to(self.device) for k, v in batch.items()}
                outputs = self.model(
                    batch['rgb'],
                    batch['lab'],
                    batch['edges']
                )
                preds.extend(outputs.cpu().numpy().flatten())
                targets.extend(batch['mask'].cpu().numpy().flatten())

        # Pixel-level metrics over all validation masks
        return {
            'f1': f1_score(targets, np.round(preds)),
            'auc': roc_auc_score(targets, preds)
        }

# --- Gradio Interface ---


def create_demo(model):
    model.eval()
    device = next(model.parameters()).device

    def predict(image):
        # Preprocess: Gradio supplies an RGB numpy array
        image = cv2.resize(image, (224, 224))
        lab = cv2.cvtColor(image, cv2.COLOR_RGB2LAB)
        edges = cv2.Canny(image, 100, 200)

        # To tensors with a batch dimension
        rgb_tensor = (torch.FloatTensor(image).permute(2, 0, 1)[None] / 255.0).to(device)
        lab_tensor = (torch.FloatTensor(lab).permute(2, 0, 1)[None] / 255.0).to(device)
        edges_tensor = (torch.FloatTensor(edges)[None, None] / 255.0).to(device)

        # Predict and threshold the soft mask at 0.5
        with torch.no_grad():
            mask = model(rgb_tensor, lab_tensor, edges_tensor)

        return (mask[0, 0].cpu().numpy() > 0.5).astype(np.uint8) * 255

    return gr.Interface(
        fn=predict,
        inputs=gr.Image(label="Input Image"),
        outputs=gr.Image(label="Forgery Mask"),
        title="CMFD Detection Demo",
        examples=[["sample1.jpg"], ["sample2.jpg"]]  # Add real examples
    )
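
# Usage note: to expose the demo beyond localhost, Gradio supports a tunnelled
# public link, e.g. create_demo(trainer.model).launch(share=True)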

# --- Main Execution ---


if __name__ == "__main__":
    # Initialize system
    trainer = CMFDTrainer()

    # Training loop
    for epoch in range(10):
        train_loss = trainer.train_epoch()
        val_metrics = trainer.validate()
        print(f"Epoch {epoch+1}:")
        print(f"  Train Loss: {train_loss:.4f}")
        print(f"  Val F1: {val_metrics['f1']:.4f}, AUC: {val_metrics['auc']:.4f}")

    # Launch demo
    demo = create_demo(trainer.model)
    demo.launch()
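
    # Optional: persist the trained weights for later reuse (path is illustrative):
    #   torch.save(trainer.model.state_dict(), "cmfd_vit.pt")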
