import pandas as pd
def clean_loyalty_data(df: pd.DataFrame) -> pd.DataFrame:
    """Return a cleaned copy of the loyalty data; *df* itself is not modified.

    Cleaning rules applied per column:

    * ``spend``, ``first_month``: converted to numeric; missing or
      unparseable values become 0; rounded to 2 decimals.
    * ``items_in_first_month``: converted to numeric; missing or unparseable
      values become 0; cast to ``int`` (fractional parts are truncated).
    * ``region``: missing -> ``'Unknown'``; whitespace stripped; title case.
    * ``joining_month``: missing -> ``'Unknown'``; whitespace stripped;
      first letter capitalized, the rest lower-cased.
    * ``loyalty_years``: missing -> ``'0-1'``.
    * ``promotion``: missing -> ``'No'``.
    * ``region``, ``loyalty_years``, ``joining_month`` and ``promotion`` are
      stored with the ``category`` dtype.

    Raises:
        KeyError: if any of the columns listed above is absent from *df*.
    """
    cleaned = df.copy()

    # Numeric columns: convert first, then fill. Coercion turns unparseable
    # text into NaN, so a single fillna afterwards covers both "missing" and
    # "not a number".
    for column in ('spend', 'first_month'):
        cleaned[column] = (
            pd.to_numeric(cleaned[column], errors='coerce').fillna(0).round(2)
        )
    cleaned['items_in_first_month'] = (
        pd.to_numeric(cleaned['items_in_first_month'], errors='coerce')
        .fillna(0)
        .astype(int)
    )

    # Categorical/text columns: assign the filled Series back instead of
    # calling fillna(inplace=True) on a column selection, which acts on a
    # temporary object and is not guaranteed to update the DataFrame.
    cleaned['region'] = (
        cleaned['region'].fillna('Unknown').str.strip().str.title()
    )
    cleaned['joining_month'] = (
        cleaned['joining_month'].fillna('Unknown').str.strip().str.capitalize()
    )
    cleaned['loyalty_years'] = cleaned['loyalty_years'].fillna('0-1')
    cleaned['promotion'] = cleaned['promotion'].fillna('No')

    # Low-cardinality text columns are stored as 'category'.
    for column in ('region', 'loyalty_years', 'joining_month', 'promotion'):
        cleaned[column] = cleaned[column].astype('category')

    return cleaned


def main() -> None:
    """Load 'loyalty.csv', clean it, print a preview and save the result."""
    # Load the data
    df = clean_loyalty_data(pd.read_csv('loyalty.csv'))
    # Display the cleaned DataFrame
    print(df.head())
    # Save the cleaned data to a new CSV file
    df.to_csv('cleaned_loyalty.csv', index=False)


if __name__ == '__main__':
    main()