code
code
import os
import re

import numpy as np
import pandas as pd
import pyarrow
# --- Load and inspect the raw training data ---------------------------------
train = pd.read_csv('dataset/train.csv')
# NOTE(review): despite the name, this grabs the row at position 20, not the
# first row — confirm which was intended.
first_row = train.iloc[20]
print(first_row)
train.info()
# Notebook-style inspection: the value_counts() result is displayed, not kept.
train.entity_name.value_counts()
# Directory holding the locally downloaded training images.
images_folder = 'images/train'
def extract_image_id(image_url):
    """Return the image identifier embedded in an image URL.

    The identifier is the final path component of *image_url* with
    everything from the first '.' onward (the extension) removed.
    """
    filename = image_url.rsplit('/', 1)[-1]
    return filename.split('.', 1)[0]
# Derive an image_id column from each row's image URL.
train['image_id'] = train['image_link'].apply(extract_image_id)
# Set of filenames actually present on disk, for O(1) membership tests.
# NOTE(review): `os` is used here but is not imported in the visible import
# block — as written this raises NameError; add `import os` at the top.
local_images = set(os.listdir(images_folder))
# NOTE(review): the body of clean_filename() and part of match_local_image()
# are missing from this file (indentation has also been stripped), so the
# code below is not runnable as-is.
def clean_filename(filename):
def match_local_image(image_id):
# NOTE(review): `cleaned_id` and `normalized_local_images` are never assigned
# in the visible code — presumably clean_filename() was meant to produce them,
# and the `if` branch body (returning the matched path) is missing.
if cleaned_id in normalized_local_images:
else:
return None
# Map every row to a local image path; rows whose image was not matched
# (match_local_image returned None) are dropped.
train['local_image_path'] = train['image_id'].apply(match_local_image)
train = train.dropna(subset=['local_image_path'])
# Notebook-style inspection (results displayed, not captured).
train.head()
train.info()
# Per-class row counts, intended to drive rare-class filtering.
class_counts = train['group_id'].value_counts()
# NOTE(review): `classes_to_remove` is never defined in the visible code —
# presumably derived from class_counts (e.g. classes below a count threshold).
train_filtered = train[~train['group_id'].isin(classes_to_remove)]
# NOTE(review): `train_set`/`test_set` are written before any visible
# train/test split — the split step (e.g. sklearn's train_test_split on
# train_filtered) is missing from this file.
train_set.to_csv('train_split.csv', index=False)
test_set.to_csv('test_split.csv', index=False)
# Round-trip through CSV so downstream cells start from the saved splits.
train_set = pd.read_csv('train_split.csv')
test_set = pd.read_csv('test_split.csv')
train_set.head()
# NOTE(review): this function is missing lines — nothing in the visible body
# binds `img` (an Image.open(img_path) call and the PIL import are absent),
# and the second `return False` after `return True` is unreachable as written.
# Intent appears to be: return True iff the file loads as a valid image.
def is_valid_image(img_path):
try:
img.load()
return False
return True
return False
except Exception as e:
# Any failure to open/load the file marks the image invalid.
return False
# Keep only rows whose image file passed validation.
# NOTE(review): `valid_train_paths` / `valid_test_paths` are never built in
# the visible code — presumably a loop applying is_valid_image() over each
# split's local_image_path column is missing.
valid_train_set = train_set[train_set['local_image_path'].isin(valid_train_paths)]
valid_test_set = test_set[test_set['local_image_path'].isin(valid_test_paths)]
# Persist the validated splits for the feature-extraction stage.
valid_train_set.to_csv('valid_train_split.csv', index=False)
valid_test_set.to_csv('valid_test_split.csv', index=False)
valid_train_set.info()
import numpy as np
import pandas as pd
# NOTE(review): this is the body of an image loading/preprocessing helper —
# the `def` line (likely `def load_and_preprocess_image(img_path,
# target_size=...)`) is missing. `image.load_img` / `preprocess_input`
# suggest keras preprocessing utilities, whose imports are also absent.
try:
img = image.load_img(img_path, target_size=target_size)
img_array = image.img_to_array(img)
img_array = preprocess_input(img_array)
return img_array
except Exception as e:
# Failed loads are signalled with None rather than raising.
return None
# NOTE(review): fragment of a batched feature-extraction function. The `def`
# line, the batching loop header (`for i in range(0, len(image_paths),
# batch_size):`), and the model call that produces `batch_features` are all
# missing, and indentation has been stripped.
features = []
batch_paths = image_paths[i:i+batch_size]
batch_images = []
for p in batch_paths:
img_array = load_and_preprocess_image(p)
batch_images.append(img_array)
# NOTE(review): this `else:` has lost its matching `if` — an
# `if img_array is not None:` guard probably preceded the append above.
else:
if len(batch_images) > 0:
# Stack per-image arrays into a single batch for the model.
batch_images = np.vstack(batch_images)
features.extend(batch_features)
else:
features.extend([None] * len(batch_paths)) # Add None for failed images
return features
# NOTE(review): a second, even more truncated copy of the feature-extraction
# loop — the loop header, image loading, and the model call producing
# `batch_features` are missing.
features = []
batch_paths = image_paths[i:i+batch_size]
features.extend(batch_features)
return features
# # Load the pre-trained ResNet50 model without the top layer (fully connected layers)
# NOTE(review): the ResNet50 construction and the feature-extraction call
# that should populate a `features` column are missing from this file.
image_paths = valid_train_set['local_image_path'].tolist()
# Extract features
features_df = pd.DataFrame({
'local_image_path': image_paths,
})
# Validate feature sizes (2048) and remove invalid entries
# NOTE(review): saving valid_train_set here does not include any extracted
# features; presumably features_df was meant to be merged in first.
valid_train_set.to_csv('train_features.csv', index=False)
df = pd.read_csv('train_features.csv')
print(df.head())
print(df.shape)
print(df.dtypes)
# NOTE(review): convert_to_array() has lost its try/except bodies — it was
# presumably parsing the CSV's stringified feature vectors back into arrays
# (e.g. np.fromstring or ast.literal_eval). Not runnable as written.
def convert_to_array(s):
try:
except:
# Re-materialize the features column as arrays, then sanity-check the types.
df['features'] = df['features'].apply(convert_to_array)
print(df['features'].apply(type).value_counts())
print(df['features'].iloc[0])
# Parquet preserves array-valued columns better than CSV round-tripping.
df.to_parquet('train_features.parquet', index=False)
# --- Repeat feature extraction/export for the test split ---------------------
test_df = pd.read_csv('valid_test_split.csv')
# Assuming you have an image paths list from the test DataFrame
test_image_paths = test_df['local_image_path'].tolist()
test_features_df = pd.DataFrame({
'local_image_path': test_image_paths,
})
# Validate feature sizes (2048) and remove invalid entries
print(test_df.head())
print(test_df.shape)
print(test_df.dtypes)
# NOTE(review): `test_df['features']` and `feature_length` are never created
# in the visible code — the model call and length computation are missing.
print(test_df['features'].apply(type).value_counts())
if (test_df['feature_length'] != 2048).any():
# NOTE(review): the bodies of this if/else, and the definition of
# `invalid_features`, are missing from this file.
else:
print(invalid_features)
test_df.to_parquet('test_features.parquet', index=False)
# --- Merge extracted features with the original labels -----------------------
# NOTE(review): absolute path '/dataset/train.csv' differs from the relative
# 'dataset/train.csv' used earlier — confirm which is intended.
train_labels = pd.read_csv('/dataset/train.csv')
# Inspect the first few rows to understand its structure
train_labels.info()
train_features = pd.read_parquet('train_features.parquet')
test_features = pd.read_parquet('test_features.parquet')
train_features.info(), test_features.head()
# NOTE(review): `merged_df` is used before the merge that creates it
# (presumably train_features.merge(train_labels, ...)) — that line is missing.
merged_df = merged_df.drop(columns=['entity_value_y'])
merged_df.info()
print(merged_df['features'].apply(type).value_counts())
# NOTE(review): clean_entity_value() is missing the extraction step that
# should bind `numbers` (likely re.findall of numeric substrings in `value`)
# and the branch body for len(numbers) > 1 (presumably averaging them, given
# the `avg_value` name). Not runnable as written.
def clean_entity_value(value):
if len(numbers) > 1:
elif numbers:
# A single extracted number is used directly.
avg_value = float(numbers[0])
else:
# No numeric content found.
avg_value = None
return avg_value