The document outlines a process for preparing and processing image data for machine learning, including loading datasets, cleaning image paths, validating image files, and extracting features using a pre-trained ResNet50 model. It involves splitting the dataset into training and testing sets, ensuring valid image formats, and saving the processed features into CSV and Parquet files. Additionally, it includes merging feature data with labels and cleaning numerical values from the dataset.

import os
import re

import numpy as np
import pandas as pd
import pyarrow  # needed by pandas for Parquet I/O

from tensorflow.keras.applications.resnet50 import ResNet50, preprocess_input
from tensorflow.keras.preprocessing import image
from tensorflow.keras.models import Model

train = pd.read_csv('dataset/train.csv')

# Inspect a sample row (index 20) and the overall structure of the dataset
sample_row = train.iloc[20]
print(sample_row)

train.info()
train.entity_name.value_counts()

images_folder = 'images/train'

# The image id is the file name (without extension) at the end of the image URL
def extract_image_id(image_url):
    return image_url.split('/')[-1].split('.')[0]

train['image_id'] = train['image_link'].apply(extract_image_id)

local_images = set(os.listdir(images_folder))

# Normalize file names so URL-derived ids and local file names can be compared
def clean_filename(filename):
    return filename.split('.')[0].replace('+', '')

normalized_local_images = {clean_filename(image_file): image_file for image_file in local_images}

def match_local_image(image_id):
    cleaned_id = clean_filename(image_id + '.jpg')
    if cleaned_id in normalized_local_images:
        return os.path.join(images_folder, normalized_local_images[cleaned_id])
    else:
        return None

train['local_image_path'] = train['image_id'].apply(match_local_image)

# Drop rows whose image is not available locally
train = train.dropna(subset=['local_image_path'])

train.head()
train.info()

class_counts = train['group_id'].value_counts()

# Find classes with only one sample (stratified splitting needs at least two samples per class)
classes_to_remove = class_counts[class_counts == 1].index

# Remove those classes from the dataset
train_filtered = train[~train['group_id'].isin(classes_to_remove)]

from sklearn.model_selection import train_test_split

# Perform train-test split with stratification on group_id
train_set, test_set = train_test_split(train_filtered, test_size=0.2, random_state=42,
                                       stratify=train_filtered['group_id'])

# Save the split datasets for future use
train_set.to_csv('train_split.csv', index=False)
test_set.to_csv('test_split.csv', index=False)

train_set = pd.read_csv('train_split.csv')
test_set = pd.read_csv('test_split.csv')

train_set.head()

from PIL import Image

def is_valid_image(img_path):
    try:
        with Image.open(img_path) as img:
            # Attempt to load the image data
            img.load()
            # Check for common issues with image files
            if img.format not in ['JPEG', 'PNG']:  # Add other formats if necessary
                print(f"Unsupported image format: {img_path}")
                return False
            # If we successfully load the image and it has content, it's valid
            return True
    except (IOError, SyntaxError) as e:
        # Handle errors related to loading or decoding the image
        print(f"Error validating image {img_path}: {e}")
        return False
    except Exception as e:
        # Catch any other exceptions
        print(f"Unexpected error with image {img_path}: {e}")
        return False

valid_train_paths = [path for path in train_set['local_image_path'] if is_valid_image(path)]
valid_test_paths = [path for path in test_set['local_image_path'] if is_valid_image(path)]

valid_train_set = train_set[train_set['local_image_path'].isin(valid_train_paths)]
valid_test_set = test_set[test_set['local_image_path'].isin(valid_test_paths)]

# Save the cleaned datasets
valid_train_set.to_csv('valid_train_split.csv', index=False)
valid_test_set.to_csv('valid_test_split.csv', index=False)

valid_train_set.info()

from tqdm import tqdm
from tensorflow.keras.applications.resnet50 import ResNet50, preprocess_input
from tensorflow.keras.preprocessing import image

# Load and preprocess a single image into the (1, 224, 224, 3) input expected by ResNet50
def load_and_preprocess_image(img_path, target_size=(224, 224)):
    try:
        img = image.load_img(img_path, target_size=target_size)
        img_array = image.img_to_array(img)
        img_array = np.expand_dims(img_array, axis=0)
        img_array = preprocess_input(img_array)
        return img_array
    except Exception as e:
        print(f"Error loading or preprocessing image {img_path}: {e}")
        return None

# Extract ResNet50 features in batches
def extract_features_batch(image_paths, batch_size=32):
    features = []
    for i in range(0, len(image_paths), batch_size):
        batch_paths = image_paths[i:i+batch_size]
        batch_images = []
        loaded = []  # Track which paths loaded so features stay aligned with image_paths
        for p in batch_paths:
            img_array = load_and_preprocess_image(p)
            if img_array is not None:
                batch_images.append(img_array)
                loaded.append(True)
            else:
                print(f"Skipping image {p} due to loading error.")
                loaded.append(False)
        if len(batch_images) > 0:
            batch_images = np.vstack(batch_images)
            print(f"Batch images shape: {batch_images.shape}")
            batch_features = base_model.predict(batch_images, verbose=0)
            print(f"Batch features shape: {batch_features.shape}")  # Should be (n_loaded, 2048)
            feature_iter = iter(batch_features)
            features.extend(next(feature_iter) if ok else None for ok in loaded)
        else:
            features.extend([None] * len(batch_paths))  # Add None for failed images
    return features

# Wrapper that feeds one batch at a time to extract_features_batch so tqdm can report progress
def extract_features_batch_with_progress(image_paths, batch_size=32):
    features = []
    for i in tqdm(range(0, len(image_paths), batch_size), desc="Extracting features"):
        batch_paths = image_paths[i:i+batch_size]
        batch_features = extract_features_batch(batch_paths, batch_size=batch_size)
        features.extend(batch_features)
    return features

# Load the pre-trained ResNet50 model without the top (fully connected) layers;
# global average pooling yields one 2048-dimensional vector per image
base_model = ResNet50(weights='imagenet', include_top=False, pooling='avg')

# Load image paths
image_paths = valid_train_set['local_image_path'].tolist()

# Extract features
features = extract_features_batch_with_progress(image_paths, batch_size=32)

# Convert features to a DataFrame (failed images keep None instead of a vector)
features_df = pd.DataFrame({
    'local_image_path': image_paths,
    'features': [f.tolist() if f is not None else None for f in features]  # numpy arrays -> lists
})
# Validate feature sizes (2048) and remove invalid entries
features_df['feature_length'] = features_df['features'].apply(lambda x: len(x) if x is not None else 0)
valid_features_df = features_df[features_df['feature_length'] == 2048]  # Keep only valid features

# Merge back with the original dataset
valid_train_set = valid_train_set.merge(valid_features_df[['local_image_path', 'features']],
                                        on='local_image_path', how='inner')

# Save the dataset with valid features
valid_train_set.to_csv('train_features.csv', index=False)

print("Feature extraction completed successfully. Dataset saved.")

# Load the dataset
df = pd.read_csv('train_features.csv')

# Inspect the first few rows
print(df.head())

# Check the dataset shape and column types
print(df.shape)
print(df.dtypes)

# Convert the string representation stored in the CSV back to a NumPy array
def convert_to_array(s):
    try:
        return np.array(eval(s), dtype=np.float32)
    except Exception:
        return np.array([])  # Return an empty array if there's an issue


# Apply the conversion function
df['features'] = df['features'].apply(convert_to_array)

# Verify data types
print(df['features'].apply(type).value_counts())

# Inspect a sample entry
print(df['features'].iloc[0])

# Check feature vector lengths
df['feature_length'] = df['features'].apply(lambda x: len(x) if x is not None else 0)
valid_features_df = df[df['feature_length'] == 2048]
print("Number of valid feature vectors:", valid_features_df.shape[0])

df.to_parquet('train_features.parquet', index=False)

test_df = pd.read_csv('valid_test_split.csv')

# Image paths from the test DataFrame
test_image_paths = test_df['local_image_path'].tolist()

# Extract features for test images
test_features = extract_features_batch_with_progress(test_image_paths, batch_size=32)

test_features_df = pd.DataFrame({
    'local_image_path': test_image_paths,
    'features': [f.tolist() if f is not None else None for f in test_features]
})
# Validate feature sizes (2048) and remove invalid entries
test_features_df['feature_length'] = test_features_df['features'].apply(lambda x: len(x) if x is not None else 0)
valid_test_features_df = test_features_df[test_features_df['feature_length'] == 2048]

# Merge back with the test DataFrame
valid_test_set = valid_test_set.merge(valid_test_features_df[['local_image_path', 'features']],
                                      on='local_image_path', how='inner')

# Drop feature columns left over from an earlier merge, if any
test_df = valid_test_set.drop(columns=['features_x', 'features_y'], errors='ignore')

# Check the DataFrame to ensure the columns are removed
print(test_df.head())
print(test_df.shape)
print(test_df.dtypes)

print(test_df['features'].apply(type).value_counts())

test_df['feature_length'] = test_df['features'].apply(lambda x: len(x) if isinstance(x, list) else 0)

if (test_df['feature_length'] != 2048).any():
    print("Warning: Some feature vectors do not have 2048 dimensions.")
else:
    print("All feature vectors have 2048 dimensions.")

invalid_features = test_df[test_df['feature_length'] != 2048]
print(invalid_features)

test_df.to_parquet('test_features.parquet', index=False)

train_labels = pd.read_csv('dataset/train.csv')

# Inspect the structure of the labels
train_labels.info()

# Load the extracted features
train_features = pd.read_parquet('train_features.parquet')
test_features = pd.read_parquet('test_features.parquet')

# Inspect the feature tables
train_features.info(), test_features.head()

merged_df = pd.merge(train_features, train_labels, on=['image_link', 'group_id', 'entity_name'],
                     how='inner')

# Keep a single entity_value column after the merge
merged_df = merged_df.drop(columns=['entity_value_y'])

merged_df.info()

print(merged_df['features'].apply(type).value_counts())

# Assuming `merged_df` is your dataframe with a column 'entity_value'
# Function to clean entity_value and extract float values
def clean_entity_value(value):
    # Normalize common range separators like 'to' and ',' to '-'
    value = value.lower().replace('to', '-').replace(',', '-')
    # Extract numbers (including ranges)
    numbers = re.findall(r"[-+]?\d*\.\d+|\d+", value)
    if len(numbers) > 1:
        # Handle ranges by averaging the values
        numbers = [float(num) for num in numbers]
        avg_value = sum(numbers) / len(numbers)
    elif numbers:
        avg_value = float(numbers[0])
    else:
        avg_value = None
    return avg_value
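
# The cleaning function above is defined but never applied in this listing.
# A minimal usage sketch (an assumption, not part of the original code): apply it to the
# label column that survived the merge. The column names 'entity_value_x' and
# 'entity_value_clean' are hypothetical and may need adjusting to your data.
label_col = 'entity_value_x' if 'entity_value_x' in merged_df.columns else 'entity_value'
merged_df['entity_value_clean'] = merged_df[label_col].astype(str).apply(clean_entity_value)

# Drop rows where no numeric value could be recovered
merged_df = merged_df.dropna(subset=['entity_value_clean'])
print(merged_df[['entity_value_clean']].describe())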
