Main.py Text File

The document is a Python script that performs data analysis, cleaning, and visualization using libraries such as pandas, matplotlib, and seaborn. It includes functions for analyzing data columns, cleaning missing values, identifying critical columns, generating tags from text, and creating various visualizations. The script also manages directories for saving visualizations and cleaned data, and it handles potential errors during execution.
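
The script is meant to be run end to end through its main() function, but the individual helpers (data_cleaning, visualize_data, and so on) can also be imported and reused on other tabular datasets. A minimal sketch, assuming the listing below is saved as main.py and a hypothetical my_data.csv exists in the working directory:

import pandas as pd
from main import data_cleaning, visualize_data  # assumes the listing below is saved as main.py

df = pd.read_csv('my_data.csv')   # hypothetical input file; the script itself reads an Excel workbook
df = data_cleaning(df)            # fill missing values, lowercase text columns, clip numeric outliers
visualize_data(df)                # charts are written to the visualizations/ directory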


import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.feature_extraction.text import CountVectorizer
import os

# Set a global seaborn style
sns.set(style="whitegrid", palette="coolwarm")  # You can also try 'darkgrid', 'ticks', etc.

# Update font style globally
plt.rcParams.update({'font.size': 12, 'font.family': 'serif'})  # Set font size and family globally

# Create the directory for visualizations
visualizations_directory = 'visualizations/'
if not os.path.exists(visualizations_directory):
    os.makedirs(visualizations_directory)

# Create the directory for cleaned data
cleaned_data_directory = 'cleaned_data_files/'
if not os.path.exists(cleaned_data_directory):
    os.makedirs(cleaned_data_directory)

def column_wise_analysis(data):
    """
    Analyzes columns for data types, unique values, and numerical summaries.
    """
    print("Data Types:\n", data.dtypes)

    for column in data.columns:
        print(f"\nColumn: {column}")
        print(f"Unique Values: {data[column].nunique()}")
        if data[column].dtype in ['float64', 'int64']:
            print(f"Summary:\n{data[column].describe()}\n")
        else:
            print(f"Value Counts:\n{data[column].value_counts().head(5)}\n")

def data_cleaning(data):
    """
    Cleans the dataset by handling missing values and inconsistencies.
    """
    # Handle missing values
    for column in data.select_dtypes(include=['float64', 'int64']).columns:
        if data[column].isnull().any():
            data[column] = data[column].fillna(data[column].mean())  # Fill numerical NaNs with the column mean

    for column in data.select_dtypes(include=['object']).columns:
        if data[column].isnull().any():
            data[column] = data[column].fillna(data[column].mode()[0])  # Fill categorical NaNs with the mode

    # Ensure uniformity in categorical columns (convert to lowercase)
    for column in data.select_dtypes(include=['object']).columns:
        data[column] = data[column].str.lower()

    # Handle outliers (optional; adjust the thresholds or drop outliers if needed)
    for column in data.select_dtypes(include=['float64', 'int64']).columns:
        upper_limit = data[column].mean() + 3 * data[column].std()
        lower_limit = data[column].mean() - 3 * data[column].std()
        data[column] = data[column].clip(lower=lower_limit, upper=upper_limit)  # Clip outliers to within limits

    return data
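
For intuition, here is a minimal sketch of what data_cleaning does to a tiny, made-up DataFrame (the column names and values are purely illustrative, not taken from the dataset, and the imports and functions above are assumed to be in scope); with values this small, the outlier-clipping step leaves everything untouched:

toy = pd.DataFrame({
    'COST': [10.0, None, 1000.0],                   # the NaN is filled with the column mean (505.0)
    'DEALER': ['ABC Motors', None, 'abc motors'],   # the NaN is filled with the mode, then all values are lowercased
})
print(data_cleaning(toy))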

def identify_critical_columns(data):
    """
    Identifies and visualizes critical columns in the dataset.
    """
    print("Columns in the dataset:", data.columns)  # Debugging: print the columns

    # Update the critical columns list based on your dataset
    critical_columns = [col for col in ['TOTALCOST', 'REPORTING_COST', 'LBRCOST', 'KM'] if col in data.columns]

    if not critical_columns:
        print("No critical columns found in the dataset.")
        return

    # Reasoning for selection
    print("\nReasoning for Selected Critical Columns:")
    reasoning = {
        'TOTALCOST': "Total expenditure; critical for profitability analysis.",
        'REPORTING_COST': "Relevant for tracking costs associated with repairs.",
        'LBRCOST': "Labor costs involved in repairs.",
        'KM': "Kilometers, relevant for repair frequency analysis."
    }

    for col in critical_columns:
        print(f"{col.upper()}: {reasoning.get(col, 'No reasoning available for this column')}")

        # Visualize the critical column (bar plot)
        plt.figure(figsize=(10, 6))
        data[col].plot(kind='bar', color='skyblue')
        plt.title(f'{col.upper()} Bar Plot')
        plt.xlabel('Index')
        plt.ylabel(col.upper())
        plt.tight_layout()
        plt.savefig(f'{visualizations_directory}{col}_barplot.png')  # Save plot as an image file
        plt.show()

def generate_tags(data, text_column='CAUSAL_PART_NM'):
    """
    Generates tags/features from free text in the dataset (if applicable).
    """
    if text_column in data.columns:
        vectorizer = CountVectorizer(stop_words='english', max_features=10)
        data[text_column] = data[text_column].fillna('')  # Replace NaNs with empty strings for text processing
        tags = vectorizer.fit_transform(data[text_column])  # Generate tag features
        print("\nGenerated Tags:", vectorizer.get_feature_names_out())  # Display top tags
    else:
        print(f"\nColumn '{text_column}' not found. Skipping tag generation.")

def visualize_data(data):
    """
    Create various visualizations for the dataset.
    """
    # Bar charts for categorical data
    categorical_columns = ['CAUSAL_PART_NM', 'PLATFORM', 'BODY_STYLE', 'DEALER_NAME', 'STATE']
    for col in categorical_columns:
        if col in data.columns:
            plt.figure(figsize=(10, 6))
            data[col].value_counts().plot(kind='bar', color='skyblue')
            plt.title(f'{col} Distribution')
            plt.xlabel(col)
            plt.ylabel('Frequency')
            plt.tight_layout()
            plt.savefig(f'{visualizations_directory}{col}_barchart.png', dpi=300)  # Save with high resolution
            plt.show()

    # Line chart for time-series data (REPAIR_DATE)
    if 'REPAIR_DATE' in data.columns:
        plt.figure(figsize=(10, 6))
        data['REPAIR_DATE'] = pd.to_datetime(data['REPAIR_DATE'], errors='coerce')  # Convert to datetime
        data.groupby(data['REPAIR_DATE'].dt.to_period('M')).size().plot(kind='line', marker='o', color='orange')
        plt.title('Repairs Over Time')
        plt.xlabel('Date')
        plt.ylabel('Number of Repairs')
        plt.tight_layout()
        plt.savefig(f'{visualizations_directory}repair_date_linechart.png', dpi=300)
        plt.show()

    # Scatter plot for KM vs TOTALCOST
    if 'KM' in data.columns and 'TOTALCOST' in data.columns:
        plt.figure(figsize=(10, 6))
        plt.scatter(data['KM'], data['TOTALCOST'], color='green')
        plt.title('Scatter Plot: KM vs TOTALCOST')
        plt.xlabel('Kilometers')
        plt.ylabel('Total Cost')
        plt.tight_layout()
        plt.savefig(f'{visualizations_directory}km_vs_totalcost_scatterplot.png', dpi=300)
        plt.show()

    # Histogram for the distribution of REPAIR_AGE
    if 'REPAIR_AGE' in data.columns:
        plt.figure(figsize=(10, 6))
        data['REPAIR_AGE'].plot(kind='hist', bins=30, edgecolor='black', color='purple')
        plt.title('Distribution of REPAIR_AGE')
        plt.xlabel('Repair Age')
        plt.ylabel('Frequency')
        plt.tight_layout()
        plt.savefig(f'{visualizations_directory}repair_age_histogram.png', dpi=300)
        plt.show()

    # Heatmap for correlation between numerical columns
    numerical_columns = data.select_dtypes(include=['float64', 'int64']).columns
    if len(numerical_columns) > 1:
        correlation_matrix = data[numerical_columns].corr()
        plt.figure(figsize=(10, 6))
        sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', fmt='.2f', cbar=True)
        plt.title('Correlation Heatmap')
        plt.tight_layout()
        plt.savefig(f'{visualizations_directory}correlation_heatmap.png', dpi=300)
        plt.show()

    # Pie chart for TRANSACTION_CATEGORY distribution
    if 'TRANSACTION_CATEGORY' in data.columns:
        plt.figure(figsize=(8, 8))
        data['TRANSACTION_CATEGORY'].value_counts().plot(kind='pie', autopct='%1.1f%%', startangle=90)
        plt.title('Transaction Category Distribution')
        plt.ylabel('')
        plt.tight_layout()
        plt.savefig(f'{visualizations_directory}transaction_category_piechart.png', dpi=300)
        plt.show()

def main():
    """
    Main function to execute data analysis, cleaning, and visualization steps.
    """
    try:
        # Load the dataset
        file_path = r'D:\Downloads(D)\Chrome\Data for Task 1. (1).xlsx'
        data = pd.read_excel(file_path)  # Load data from Excel file
        print("Dataset loaded successfully.")

        # Display initial dataset overview
        print("\nInitial Dataset Overview:")
        print(data.head())
        print(data.info())

        # Step 1: Perform column-wise analysis
        print("\nPerforming column-wise analysis...")
        column_wise_analysis(data)

        # Step 2: Clean the data
        print("\nCleaning data...")
        cleaned_data = data_cleaning(data)
        print("\nData after cleaning:")
        print(cleaned_data.head())

        # Step 3: Identify and analyze critical columns
        print("\nIdentifying and analyzing critical columns...")
        identify_critical_columns(cleaned_data)

        # Step 4: Generate tags/features from free text
        print("\nGenerating tags from free text (if applicable)...")
        generate_tags(cleaned_data, text_column='CAUSAL_PART_NM')

        # Step 5: Visualize the data
        print("\nVisualizing the data...")
        visualize_data(cleaned_data)

        # Step 6: Save the cleaned data to a CSV file
        output_path = os.path.join(cleaned_data_directory, 'cleaned_data.csv')
        cleaned_data.to_csv(output_path, index=False)
        print(f"\nCleaned data saved to {output_path}")

    except Exception as e:
        print(f"An error occurred: {e}")


if __name__ == '__main__':
    main()
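
After a successful run, the cleaned dataset is written to cleaned_data_files/cleaned_data.csv and can be reloaded for any follow-up analysis, for example:

import pandas as pd

cleaned = pd.read_csv('cleaned_data_files/cleaned_data.csv')
print(cleaned.shape)   # number of rows and columns in the cleaned data
print(cleaned.head())  # note that all text columns were lowercased during cleaning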
