# Main.py
def column_wise_analysis(data):
    """
    Print the data type of every column in the dataset.

    Only the dtype listing is produced here; unique-value counts and
    numerical summaries are not computed by this function.

    Args:
        data: Object exposing a ``dtypes`` attribute (presumably a pandas
            DataFrame -- confirm against the caller).

    Returns:
        None. The report is written to standard output.
    """
    print("Data Types:\n", data.dtypes)
def data_cleaning(data):
    """
    Clean the dataset by filling missing numeric values and clipping outliers.

    For every ``float64`` / ``int64`` column:

    1. Missing values are replaced with the column mean.
    2. Values are clipped to the range ``mean +/- 3 * std``, where mean and
       standard deviation are taken after the missing values were filled.

    Columns of any other dtype are left untouched.

    Args:
        data: pandas DataFrame to clean. It is modified in place.

    Returns:
        The same DataFrame object that was passed in, after cleaning.
    """
    # Resolve the numeric columns once; filling and clipping never remove a
    # column from this selection.
    numeric_columns = data.select_dtypes(include=['float64', 'int64']).columns

    for column in numeric_columns:
        series = data[column]

        # Handle missing values: fill numerical NaNs with the column mean.
        if series.isnull().any():
            series = series.fillna(series.mean())

        # Handle outliers (optional, adjust the thresholds or remove outliers
        # if needed): clip to within three standard deviations of the mean.
        center = series.mean()
        spread = series.std()
        upper_limit = center + 3 * spread
        lower_limit = center - 3 * spread
        data[column] = series.clip(lower=lower_limit, upper=upper_limit)

    return data
def identify_critical_columns(data):
    """
    Report the dataset's columns and whether any critical columns exist.

    Prints the column labels of ``data``. If ``critical_columns`` is empty,
    prints a notice and returns. No visualization is performed by the code
    in this function.

    Args:
        data: Object exposing a ``columns`` attribute (presumably a pandas
            DataFrame -- confirm against the caller).

    Returns:
        None.
    """
    print("Columns in the dataset:", data.columns)  # Debugging: Print the columns

    # NOTE(review): `critical_columns` is never assigned inside this function,
    # so it must exist at module level; otherwise the check below raises
    # NameError. TODO confirm where it is defined and whether it is meant to
    # be derived from `data`.
    if not critical_columns:
        print("No critical columns found in the dataset.")
        return
def visualize_data(data):
    """
    Create, save and show visualizations for the dataset.

    Draws one bar chart of value counts for each known categorical column
    that is present in ``data``. When a ``REPAIR_DATE`` column is present,
    also draws a line chart of the number of rows per calendar month. Every
    figure is saved as a 300-dpi PNG and then shown.

    Args:
        data: pandas DataFrame to plot. ``REPAIR_DATE``, when present, must
            hold datetime values because it is accessed through ``.dt``.

    Returns:
        None.
    """
    # NOTE(review): `plt` and `visualizations_directory` are not defined in
    # this function and must be provided at module level. The directory is
    # concatenated with the file name as-is, so it presumably ends with a
    # path separator -- confirm.

    # Bar Charts for categorical data
    categorical_columns = ['CAUSAL_PART_NM', 'PLATFORM', 'BODY_STYLE',
                           'DEALER_NAME', 'STATE']
    for col in categorical_columns:
        if col not in data.columns:
            continue
        plt.figure(figsize=(10, 6))
        data[col].value_counts().plot(kind='bar', color='skyblue')
        plt.title(f'{col} Distribution')
        plt.xlabel(col)
        plt.ylabel('Frequency')
        plt.tight_layout()
        # Save with high resolution
        plt.savefig(f'{visualizations_directory}{col}_barchart.png', dpi=300)
        plt.show()
        # Release the figure: on non-interactive backends show() leaves it
        # open, which leaks memory and keeps it as the current figure.
        plt.close()

    # Line chart of repairs per month. Guarded like the categorical columns
    # so a dataset without the column is skipped instead of raising KeyError.
    if 'REPAIR_DATE' in data.columns:
        # Start a fresh figure so the line is never drawn on top of the last
        # bar chart's axes.
        plt.figure(figsize=(10, 6))
        repairs_per_month = data.groupby(
            data['REPAIR_DATE'].dt.to_period('M')).size()
        repairs_per_month.plot(kind='line', marker='o', color='orange')
        plt.title('Repairs Over Time')
        plt.xlabel('Date')
        plt.ylabel('Number of Repairs')
        plt.tight_layout()
        plt.savefig(f'{visualizations_directory}repair_date_linechart.png',
                    dpi=300)
        plt.show()
        plt.close()
def main(file_path=r'D:\Downloads(D)\Chrome\Data for Task 1. (1).xlsx'):
    """
    Load the dataset from an Excel workbook and report the outcome.

    Only the loading step is performed by the code in this function; the
    analysis, cleaning and visualization steps are not invoked here.

    Args:
        file_path: Path of the Excel workbook to read. Defaults to the
            original hard-coded location, so calling ``main()`` with no
            arguments behaves as before.

    Returns:
        None. Success or the error message is printed to standard output.
    """
    try:
        # Load the dataset
        data = pd.read_excel(file_path)  # Load data from Excel file
    except Exception as e:
        # Top-level boundary: report any load failure instead of crashing.
        print(f"An error occurred: {e}")
    else:
        print("Dataset loaded successfully.")
if __name__ == '__main__':
    # Run the pipeline only when executed as a script, not when imported.
    main()