The document outlines a process for preparing and processing image data for machine learning, including loading datasets, cleaning image paths, validating image files, and extracting features using a pre-trained ResNet50 model. It involves splitting the dataset into training and testing sets, ensuring valid image formats, and saving the processed features into CSV and Parquet files. Additionally, it includes merging feature data with labels and cleaning numerical values from the dataset.

import os
import re

import numpy as np
import pandas as pd
import pyarrow  # needed by pandas for Parquet I/O

from tensorflow.keras.applications.resnet50 import ResNet50, preprocess_input
from tensorflow.keras.preprocessing import image
from tensorflow.keras.models import Model

train = pd.read_csv('dataset/train.csv')

# Inspect a sample row (index 20) and the overall structure of the dataset
sample_row = train.iloc[20]
print(sample_row)

train.info()
train.entity_name.value_counts()

images_folder = 'images/train'

# The image id is the file name (without extension) at the end of the image URL
def extract_image_id(image_url):
    return image_url.split('/')[-1].split('.')[0]

train['image_id'] = train['image_link'].apply(extract_image_id)

local_images = set(os.listdir(images_folder))

# Normalize file names so URL-derived ids and local file names can be compared
def clean_filename(filename):
    return filename.split('.')[0].replace('+', '')

normalized_local_images = {clean_filename(image_file): image_file for image_file in local_images}

def match_local_image(image_id):
    cleaned_id = clean_filename(image_id + '.jpg')
    if cleaned_id in normalized_local_images:
        return os.path.join(images_folder, normalized_local_images[cleaned_id])
    else:
        return None

train['local_image_path'] = train['image_id'].apply(match_local_image)

# Drop rows whose image is not available locally
train = train.dropna(subset=['local_image_path'])

train.head()
train.info()

class_counts = train['group_id'].value_counts()

# Find classes with only one sample (stratified splitting needs at least two samples per class)
classes_to_remove = class_counts[class_counts == 1].index

# Remove those classes from the dataset
train_filtered = train[~train['group_id'].isin(classes_to_remove)]

from sklearn.model_selection import train_test_split

# Perform train-test split with stratification on group_id
train_set, test_set = train_test_split(train_filtered, test_size=0.2, random_state=42,
                                       stratify=train_filtered['group_id'])

# Save the split datasets for future use
train_set.to_csv('train_split.csv', index=False)
test_set.to_csv('test_split.csv', index=False)

train_set = pd.read_csv('train_split.csv')
test_set = pd.read_csv('test_split.csv')

train_set.head()

from PIL import Image

def is_valid_image(img_path):
    try:
        with Image.open(img_path) as img:
            # Attempt to load the image data
            img.load()
            # Check for common issues with image files
            if img.format not in ['JPEG', 'PNG']:  # Add other formats if necessary
                print(f"Unsupported image format: {img_path}")
                return False
            # If we successfully load the image and it has content, it's valid
            return True
    except (IOError, SyntaxError) as e:
        # Handle errors related to loading or decoding the image
        print(f"Error validating image {img_path}: {e}")
        return False
    except Exception as e:
        # Catch any other exceptions
        print(f"Unexpected error with image {img_path}: {e}")
        return False

valid_train_paths = [path for path in train_set['local_image_path'] if is_valid_image(path)]
valid_test_paths = [path for path in test_set['local_image_path'] if is_valid_image(path)]

valid_train_set = train_set[train_set['local_image_path'].isin(valid_train_paths)]
valid_test_set = test_set[test_set['local_image_path'].isin(valid_test_paths)]

# Save the cleaned datasets
valid_train_set.to_csv('valid_train_split.csv', index=False)
valid_test_set.to_csv('valid_test_split.csv', index=False)

valid_train_set.info()

from tqdm import tqdm
from tensorflow.keras.applications.resnet50 import ResNet50, preprocess_input
from tensorflow.keras.preprocessing import image

# Load and preprocess a single image into the (1, 224, 224, 3) input expected by ResNet50
def load_and_preprocess_image(img_path, target_size=(224, 224)):
    try:
        img = image.load_img(img_path, target_size=target_size)
        img_array = image.img_to_array(img)
        img_array = np.expand_dims(img_array, axis=0)
        img_array = preprocess_input(img_array)
        return img_array
    except Exception as e:
        print(f"Error loading or preprocessing image {img_path}: {e}")
        return None

# Extract ResNet50 features in batches
def extract_features_batch(image_paths, batch_size=32):
    features = []
    for i in range(0, len(image_paths), batch_size):
        batch_paths = image_paths[i:i+batch_size]
        batch_images = []
        loaded = []  # Track which paths loaded so features stay aligned with image_paths
        for p in batch_paths:
            img_array = load_and_preprocess_image(p)
            if img_array is not None:
                batch_images.append(img_array)
                loaded.append(True)
            else:
                print(f"Skipping image {p} due to loading error.")
                loaded.append(False)
        if len(batch_images) > 0:
            batch_images = np.vstack(batch_images)
            print(f"Batch images shape: {batch_images.shape}")
            batch_features = base_model.predict(batch_images, verbose=0)
            print(f"Batch features shape: {batch_features.shape}")  # Should be (n_loaded, 2048)
            feature_iter = iter(batch_features)
            features.extend(next(feature_iter) if ok else None for ok in loaded)
        else:
            features.extend([None] * len(batch_paths))  # Add None for failed images
    return features

# Wrapper that feeds one batch at a time to extract_features_batch so tqdm can report progress
def extract_features_batch_with_progress(image_paths, batch_size=32):
    features = []
    for i in tqdm(range(0, len(image_paths), batch_size), desc="Extracting features"):
        batch_paths = image_paths[i:i+batch_size]
        batch_features = extract_features_batch(batch_paths, batch_size=batch_size)
        features.extend(batch_features)
    return features

# Load the pre-trained ResNet50 model without the top (fully connected) layers;
# global average pooling yields one 2048-dimensional vector per image
base_model = ResNet50(weights='imagenet', include_top=False, pooling='avg')

# Load image paths
image_paths = valid_train_set['local_image_path'].tolist()

# Extract features
features = extract_features_batch_with_progress(image_paths, batch_size=32)

# Convert features to a DataFrame (failed images keep None instead of a vector)
features_df = pd.DataFrame({
    'local_image_path': image_paths,
    'features': [f.tolist() if f is not None else None for f in features]  # numpy arrays -> lists
})
# Validate feature sizes (2048) and remove invalid entries
features_df['feature_length'] = features_df['features'].apply(lambda x: len(x) if x is not None else 0)
valid_features_df = features_df[features_df['feature_length'] == 2048]  # Keep only valid features

# Merge back with the original dataset
valid_train_set = valid_train_set.merge(valid_features_df[['local_image_path', 'features']],
                                        on='local_image_path', how='inner')

# Save the dataset with valid features
valid_train_set.to_csv('train_features.csv', index=False)

print("Feature extraction completed successfully. Dataset saved.")

# Load the dataset
df = pd.read_csv('train_features.csv')

# Inspect the first few rows
print(df.head())

# Check the dataset shape and column types
print(df.shape)
print(df.dtypes)

# Convert the string representation stored in the CSV back to a NumPy array
def convert_to_array(s):
    try:
        return np.array(eval(s), dtype=np.float32)
    except Exception:
        return np.array([])  # Return an empty array if there's an issue


# Apply the conversion function
df['features'] = df['features'].apply(convert_to_array)

# Verify data types
print(df['features'].apply(type).value_counts())

# Inspect a sample entry
print(df['features'].iloc[0])

# Check feature vector lengths
df['feature_length'] = df['features'].apply(lambda x: len(x) if x is not None else 0)
valid_features_df = df[df['feature_length'] == 2048]
print("Number of valid feature vectors:", valid_features_df.shape[0])

df.to_parquet('train_features.parquet', index=False)

test_df = pd.read_csv('valid_test_split.csv')

# Image paths from the test DataFrame
test_image_paths = test_df['local_image_path'].tolist()

# Extract features for test images
test_features = extract_features_batch_with_progress(test_image_paths, batch_size=32)

test_features_df = pd.DataFrame({
    'local_image_path': test_image_paths,
    'features': [f.tolist() if f is not None else None for f in test_features]
})
# Validate feature sizes (2048) and remove invalid entries
test_features_df['feature_length'] = test_features_df['features'].apply(lambda x: len(x) if x is not None else 0)
valid_test_features_df = test_features_df[test_features_df['feature_length'] == 2048]

# Merge back with the test DataFrame
valid_test_set = valid_test_set.merge(valid_test_features_df[['local_image_path', 'features']],
                                      on='local_image_path', how='inner')

# Drop feature columns left over from an earlier merge, if any
test_df = valid_test_set.drop(columns=['features_x', 'features_y'], errors='ignore')

# Check the DataFrame to ensure the columns are removed
print(test_df.head())
print(test_df.shape)
print(test_df.dtypes)

print(test_df['features'].apply(type).value_counts())

test_df['feature_length'] = test_df['features'].apply(lambda x: len(x) if isinstance(x, list) else 0)

if (test_df['feature_length'] != 2048).any():
    print("Warning: Some feature vectors do not have 2048 dimensions.")
else:
    print("All feature vectors have 2048 dimensions.")

invalid_features = test_df[test_df['feature_length'] != 2048]
print(invalid_features)

test_df.to_parquet('test_features.parquet', index=False)

train_labels = pd.read_csv('dataset/train.csv')

# Inspect the structure of the labels
train_labels.info()

# Load the extracted features
train_features = pd.read_parquet('train_features.parquet')
test_features = pd.read_parquet('test_features.parquet')

# Inspect the feature tables
train_features.info(), test_features.head()

merged_df = pd.merge(train_features, train_labels, on=['image_link', 'group_id', 'entity_name'],
                     how='inner')

# Keep a single entity_value column after the merge
merged_df = merged_df.drop(columns=['entity_value_y'])

merged_df.info()

print(merged_df['features'].apply(type).value_counts())

# Assuming `merged_df` is your dataframe with a column 'entity_value'
# Function to clean entity_value and extract float values
def clean_entity_value(value):
    # Normalize common range separators like 'to' and ',' to '-'
    value = value.lower().replace('to', '-').replace(',', '-')
    # Extract numbers (including ranges)
    numbers = re.findall(r"[-+]?\d*\.\d+|\d+", value)
    if len(numbers) > 1:
        # Handle ranges by averaging the values
        numbers = [float(num) for num in numbers]
        avg_value = sum(numbers) / len(numbers)
    elif numbers:
        avg_value = float(numbers[0])
    else:
        avg_value = None
    return avg_value
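
# The cleaning function above is defined but never applied in this listing.
# A minimal usage sketch (an assumption, not part of the original code): apply it to the
# label column that survived the merge. The column names 'entity_value_x' and
# 'entity_value_clean' are hypothetical and may need adjusting to your data.
label_col = 'entity_value_x' if 'entity_value_x' in merged_df.columns else 'entity_value'
merged_df['entity_value_clean'] = merged_df[label_col].astype(str).apply(clean_entity_value)

# Drop rows where no numeric value could be recovered
merged_df = merged_df.dropna(subset=['entity_value_clean'])
print(merged_df[['entity_value_clean']].describe())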
